From: Steven Rostedt <rostedt@goodmis.org>
As of commit 654ced4a1377 ("tracing: Introduce tracepoint_is_faultable()")
system call trace events allow faulting in user space memory. Have some of
the system call trace events take advantage of this.
Introduce a way to read nul-terminated user space strings into the trace
event. This is accomplished by creating a per CPU temporary buffer that the
unsafe user memory is copied into.
When a syscall trace event needs to read user memory, it reads a per CPU
counter that gets incremented every time a user space task is scheduled in
on the CPU. It then enables preemption, copies the user space memory into
this buffer, and then disables preemption again. If the counter has
increased by less than two from its original value, the buffer is valid.
Otherwise the copy needs to be tried again.
The reason to check for an increase of less than two, and not for the
counter being unchanged, is that scheduling in kernel tasks is fine. Only
user space tasks will write to this buffer. If the task schedules out, only
kernel tasks run, and then the task schedules back in, the counter will
have been incremented by only one.
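
In rough outline (a simplified sketch of the copy helper added below, with
the retry limit and truncation handling left out), the sequence is:

    unsigned int cnt;
    long ret;

    do {
        /* Snapshot how many user space tasks have been scheduled in here */
        cnt = this_cpu_read(sched_switch_cnt);

        migrate_disable();              /* stay on this CPU while faulting */
        preempt_enable_notrace();       /* the copy may fault and sleep */

        ret = strncpy_from_user(buf, uptr, size);

        preempt_disable_notrace();
        migrate_enable();

        /*
         * An increase of two or more means another user space task ran
         * on this CPU and may have written over buf, so copy it again.
         */
    } while (ret >= 0 && this_cpu_read(sched_switch_cnt) > cnt + 1);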
The syscall event has its nb_args shortened from an int to a short (where
even a u8 would be plenty big enough) and the freed two bytes are used for
"user_mask". The new "user_mask" field is a bitmask over the "args" array:
a set bit means that argument holds an address to read from user space.
The mask is set to 0 if the system call event does not need to read user
space for any field, so it also indicates whether the event may fault or
not. Only one bit set in user_mask is supported at this time.
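
For example, with the table added in check_faultable_syscall() below,
openat() passes its user space filename as the second argument, so its
metadata ends up with:

    /* openat(int dfd, const char __user *filename, int flags, umode_t mode) */
    sys_data->user_mask = BIT(1);   /* args[1] is a user space string */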
This allows the output to look like this:
sys_access(filename: 0x7f8c55368470 "/etc/ld.so.preload", mode: 4)
sys_execve(filename: 0x564ebcf5a6b8 "/usr/bin/emacs", argv: 0x7fff357c0300, envp: 0x564ebc4a4820)
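
For instance, assuming tracefs is mounted at /sys/kernel/tracing, output
like the above can be seen with something along these lines:

    # echo 1 > /sys/kernel/tracing/events/syscalls/sys_enter_access/enable
    # echo 1 > /sys/kernel/tracing/events/syscalls/sys_enter_execve/enable
    # cat /sys/kernel/tracing/trace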
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
include/trace/syscall.h | 4 +-
kernel/trace/trace_syscalls.c | 496 ++++++++++++++++++++++++++++++++--
2 files changed, 480 insertions(+), 20 deletions(-)
diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index 8e193f3a33b3..85f21ca15a41 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -16,6 +16,7 @@
* @name: name of the syscall
* @syscall_nr: number of the syscall
* @nb_args: number of parameters it takes
+ * @user_mask: mask of @args that will read user space
* @types: list of types as strings
* @args: list of args as strings (args[i] matches types[i])
* @enter_fields: list of fields for syscall_enter trace event
@@ -25,7 +26,8 @@
struct syscall_metadata {
const char *name;
int syscall_nr;
- int nb_args;
+ short nb_args;
+ short user_mask;
const char **types;
const char **args;
struct list_head enter_fields;
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 0f932b22f9ec..3233319ce266 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -10,6 +10,8 @@
#include <linux/xarray.h>
#include <asm/syscall.h>
+#include <trace/events/sched.h>
+
#include "trace_output.h"
#include "trace.h"
@@ -123,6 +125,9 @@ const char *get_syscall_name(int syscall)
return entry->name;
}
+/* Added to user strings when max limit is reached */
+#define EXTRA "..."
+
static enum print_line_t
print_syscall_enter(struct trace_iterator *iter, int flags,
struct trace_event *event)
@@ -132,7 +137,8 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
struct trace_entry *ent = iter->ent;
struct syscall_trace_enter *trace;
struct syscall_metadata *entry;
- int i, syscall;
+ int i, syscall, val;
+ unsigned char *ptr;
trace = (typeof(trace))ent;
syscall = trace->nr;
@@ -167,6 +173,17 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
else
trace_seq_printf(s, "%s: 0x%lx", entry->args[i],
trace->args[i]);
+
+ if (!(BIT(i) & entry->user_mask))
+ continue;
+
+ /* This arg points to a user space string */
+ ptr = (void *)trace->args + sizeof(long) * entry->nb_args;
+ val = *(int *)ptr;
+
+ ptr = (void *)ent + (val & 0xffff);
+
+ trace_seq_printf(s, " \"%s\"", ptr);
}
trace_seq_putc(s, ')');
@@ -223,15 +240,27 @@ __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
for (i = 0; i < entry->nb_args; i++) {
- pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
- entry->args[i], sizeof(unsigned long),
- i == entry->nb_args - 1 ? "" : ", ");
+ if (i)
+ pos += snprintf(buf + pos, LEN_OR_ZERO, ", ");
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx",
+ entry->args[i], sizeof(unsigned long));
+
+ if (!(BIT(i) & entry->user_mask))
+ continue;
+
+ /* Add the format for the user space string */
+ pos += snprintf(buf + pos, LEN_OR_ZERO, " \\\"%%s\\\"");
}
pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
for (i = 0; i < entry->nb_args; i++) {
pos += snprintf(buf + pos, LEN_OR_ZERO,
", ((unsigned long)(REC->%s))", entry->args[i]);
+ if (!(BIT(i) & entry->user_mask))
+ continue;
+ /* The user space string for arg has name __<arg>_val */
+ pos += snprintf(buf + pos, LEN_OR_ZERO, ", __get_str(__%s_val)",
+ entry->args[i]);
}
#undef LEN_OR_ZERO
@@ -277,8 +306,12 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call)
{
struct syscall_trace_enter trace;
struct syscall_metadata *meta = call->data;
+ unsigned long mask;
+ char *arg;
int offset = offsetof(typeof(trace), args);
+ int idx;
int ret = 0;
+ int len;
int i;
for (i = 0; i < meta->nb_args; i++) {
@@ -291,9 +324,252 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call)
offset += sizeof(unsigned long);
}
+ if (ret || !meta->user_mask)
+ return ret;
+
+ mask = meta->user_mask;
+ idx = ffs(mask) - 1;
+
+ /*
+ * User space strings are faulted into a temporary buffer and then
+ * added as a dynamic string to the end of the event.
+ * The user space string name for the arg pointer is "__<arg>_val".
+ */
+ len = strlen(meta->args[idx]) + sizeof("___val");
+ arg = kmalloc(len, GFP_KERNEL);
+ if (WARN_ON_ONCE(!arg)) {
+ meta->user_mask = 0;
+ return -ENOMEM;
+ }
+
+ snprintf(arg, len, "__%s_val", meta->args[idx]);
+
+ ret = trace_define_field(call, "__data_loc char[]",
+ arg, offset, sizeof(int), 0,
+ FILTER_OTHER);
+ if (ret)
+ kfree(arg);
return ret;
}
+struct syscall_buf {
+ char *buf;
+};
+
+struct syscall_buf_info {
+ struct rcu_head rcu;
+ struct syscall_buf __percpu *sbuf;
+};
+
+/* Create a per CPU temporary buffer to copy user space pointers into */
+#define SYSCALL_FAULT_BUF_SZ 512
+static struct syscall_buf_info *syscall_buffer;
+static DEFINE_PER_CPU(unsigned long, sched_switch_cnt);
+
+static int syscall_fault_buffer_cnt;
+
+static void syscall_fault_buffer_free(struct syscall_buf_info *sinfo)
+{
+ char *buf;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ buf = per_cpu_ptr(sinfo->sbuf, cpu)->buf;
+ kfree(buf);
+ }
+ kfree(sinfo);
+}
+
+static void rcu_free_syscall_buffer(struct rcu_head *rcu)
+{
+ struct syscall_buf_info *sinfo = container_of(rcu, struct syscall_buf_info, rcu);
+
+ syscall_fault_buffer_free(sinfo);
+}
+
+/*
+ * The per CPU buffer syscall_fault_buffer is written to optimistically.
+ * The counter sched_switch_cnt is read, preemption is enabled, and the
+ * user space memory is copied into the syscall_fault_buffer. Then
+ * preemption is disabled again and the count is read once more. If the
+ * count is greater than one from its previous reading, another user space
+ * task was scheduled in and the buffer is unreliable for use.
+ */
+static void
+probe_sched_switch(void *ignore, bool preempt,
+ struct task_struct *prev, struct task_struct *next,
+ unsigned int prev_state)
+{
+ /*
+ * The buffer can only be corrupted by another user space task.
+ * Ignore kernel tasks that may be scheduled in order to process
+ * the faulting memory.
+ */
+ if (next->flags & (PF_KTHREAD | PF_USER_WORKER))
+ return;
+
+ this_cpu_inc(sched_switch_cnt);
+}
+
+static int syscall_fault_buffer_enable(void)
+{
+ struct syscall_buf_info *sinfo;
+ char *buf;
+ int cpu;
+ int ret;
+
+ lockdep_assert_held(&syscall_trace_lock);
+
+ if (syscall_fault_buffer_cnt++)
+ return 0;
+
+ sinfo = kmalloc(sizeof(sinfo), GFP_KERNEL);
+ if (!sinfo)
+ return -ENOMEM;
+
+ sinfo->sbuf = alloc_percpu(struct syscall_buf);
+ if (!sinfo->sbuf) {
+ kfree(sinfo);
+ return -ENOMEM;
+ }
+
+ /* Clear each buffer in case of error */
+ for_each_possible_cpu(cpu) {
+ per_cpu_ptr(sinfo->sbuf, cpu)->buf = NULL;
+ }
+
+ for_each_possible_cpu(cpu) {
+ buf = kmalloc_node(SYSCALL_FAULT_BUF_SZ, GFP_KERNEL,
+ cpu_to_node(cpu));
+ if (!buf) {
+ syscall_fault_buffer_free(sinfo);
+ return -ENOMEM;
+ }
+ per_cpu_ptr(sinfo->sbuf, cpu)->buf = buf;
+ }
+
+ ret = register_trace_sched_switch(probe_sched_switch, NULL);
+ if (ret < 0) {
+ syscall_fault_buffer_free(sinfo);
+ return ret;
+ }
+ WRITE_ONCE(syscall_buffer, sinfo);
+ return 0;
+}
+
+static void syscall_fault_buffer_disable(void)
+{
+ struct syscall_buf_info *sinfo = syscall_buffer;
+
+ lockdep_assert_held(&syscall_trace_lock);
+
+ if (--syscall_fault_buffer_cnt)
+ return;
+
+ WRITE_ONCE(syscall_buffer, NULL);
+
+ unregister_trace_sched_switch(probe_sched_switch, NULL);
+ call_rcu_tasks_trace(&sinfo->rcu, rcu_free_syscall_buffer);
+}
+
+static char *sys_fault_user(struct syscall_metadata *sys_data, struct syscall_buf_info *sinfo,
+ unsigned long *args, unsigned int *data_size)
+{
+ char *buf = per_cpu_ptr(sinfo->sbuf, smp_processor_id())->buf;
+ unsigned long size = SYSCALL_FAULT_BUF_SZ - 1;
+ unsigned long mask = sys_data->user_mask;
+ unsigned int cnt;
+ int idx = ffs(mask) - 1;
+ char *ptr;
+ int tries = 0;
+ int ret;
+
+ /* Get the pointer to user space memory to read */
+ ptr = (char *)args[idx];
+ *data_size = 0;
+
+ again:
+ /*
+ * If this task is preempted by another user space task, it
+ * will cause this task to try again. But just in case something
+ * changes where the copying from user space causes another task
+ * to run, prevent this from going into an infinite loop.
+ * 10 tries should be plenty.
+ */
+ if (tries++ > 10) {
+ static bool once;
+ /*
+ * Only print a message instead of a WARN_ON() as this could
+ * theoretically trigger under real load.
+ */
+ if (!once)
+ pr_warn("Error: Too many tries to read syscall %s\n", sys_data->name);
+ once = true;
+ return buf;
+ }
+
+ /* Read the current sched switch count */
+ cnt = this_cpu_read(sched_switch_cnt);
+
+ /*
+ * Preemption is going to be enabled, but this task must
+ * remain on this CPU.
+ */
+ migrate_disable();
+
+ /*
+ * Now preemption is being enabled and another task can come in
+ * and use the same buffer and corrupt our data.
+ */
+ preempt_enable_notrace();
+
+ ret = strncpy_from_user(buf, ptr, size);
+
+ preempt_disable_notrace();
+ migrate_enable();
+
+ /* If it faulted, no use to try again */
+ if (ret < 0)
+ return buf;
+
+ /*
+ * Preemption is disabled again, now check the sched_switch_cnt.
+ * If it increased by two or more, then another user space process
+ * may have scheduled in and corrupted our buffer. In that case
+ * the copying must be retried.
+ *
+ * Note, if this task was scheduled out and only kernel threads
+ * were scheduled in (maybe to process the fault), then the
+ * counter would only increment by one when this task scheduled back in.
+ * If this task scheduled out and another user task scheduled
+ * in, this task would still need to be scheduled back in and
+ * the counter would increment by at least two.
+ */
+ if (this_cpu_read(sched_switch_cnt) > cnt + 1)
+ goto again;
+
+ /* Replace any non-printable characters with '.' */
+ for (int i = 0; i < ret; i++) {
+ if (!isprint(buf[i]))
+ buf[i] = '.';
+ }
+
+ /*
+ * If the text was truncated due to our max limit, add "..." to
+ * the string.
+ */
+ if (ret > SYSCALL_FAULT_BUF_SZ - sizeof(EXTRA)) {
+ strscpy(buf + SYSCALL_FAULT_BUF_SZ - sizeof(EXTRA),
+ EXTRA, sizeof(EXTRA));
+ ret = SYSCALL_FAULT_BUF_SZ;
+ } else {
+ buf[ret++] = '\0';
+ }
+
+ *data_size = ret;
+ return buf;
+}
+
static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
{
struct trace_array *tr = data;
@@ -302,15 +578,17 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
struct syscall_metadata *sys_data;
struct trace_event_buffer fbuffer;
unsigned long args[6];
+ char *user_ptr;
+ int user_size = 0;
int syscall_nr;
- int size;
+ int size = 0;
+ bool mayfault;
/*
* Syscall probe called with preemption enabled, but the ring
* buffer and per-cpu data require preemption to be disabled.
*/
might_fault();
- guard(preempt_notrace)();
syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
@@ -327,7 +605,32 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
if (!sys_data)
return;
- size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
+ /* Check if this syscall event faults in user space memory */
+ mayfault = sys_data->user_mask != 0;
+
+ guard(preempt_notrace)();
+
+ syscall_get_arguments(current, regs, args);
+
+ if (mayfault) {
+ struct syscall_buf_info *sinfo;
+
+ /* If the syscall_buffer is NULL, tracing is being shutdown */
+ sinfo = READ_ONCE(syscall_buffer);
+ if (!sinfo)
+ return;
+
+ user_ptr = sys_fault_user(sys_data, sinfo, args, &user_size);
+ /*
+ * user_size is the amount of data to append.
+ * Need to add 4 for the meta field that points to
+ * the user memory at the end of the event and also
+ * stores its size.
+ */
+ size = 4 + user_size;
+ }
+
+ size += sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
entry = trace_event_buffer_reserve(&fbuffer, trace_file, size);
if (!entry)
@@ -335,9 +638,36 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
entry = ring_buffer_event_data(fbuffer.event);
entry->nr = syscall_nr;
- syscall_get_arguments(current, regs, args);
+
memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args);
+ if (mayfault) {
+ void *ptr;
+ int val;
+
+ /*
+ * Set the pointer to point to the meta data of the event
+ * that has information about the stored user space memory.
+ */
+ ptr = (void *)entry->args + sizeof(unsigned long) * sys_data->nb_args;
+
+ /*
+ * The meta data will store the offset of the user data from
+ * the beginning of the event.
+ */
+ val = (ptr - (void *)entry) + 4;
+
+ /* Store the offset and the size into the meta data */
+ *(int *)ptr = val | (user_size << 16);
+
+ /* Nothing to do if the user space was empty or faulted */
+ if (user_size) {
+ /* Now store the user space data into the event */
+ ptr += 4;
+ memcpy(ptr, user_ptr, user_size);
+ }
+ }
+
trace_event_buffer_commit(&fbuffer);
}
@@ -386,39 +716,50 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
static int reg_event_syscall_enter(struct trace_event_file *file,
struct trace_event_call *call)
{
+ struct syscall_metadata *sys_data = call->data;
struct trace_array *tr = file->tr;
int ret = 0;
int num;
- num = ((struct syscall_metadata *)call->data)->syscall_nr;
+ num = sys_data->syscall_nr;
if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
return -ENOSYS;
- mutex_lock(&syscall_trace_lock);
- if (!tr->sys_refcount_enter)
+ guard(mutex)(&syscall_trace_lock);
+ if (sys_data->user_mask) {
+ ret = syscall_fault_buffer_enable();
+ if (ret)
+ return ret;
+ }
+ if (!tr->sys_refcount_enter) {
ret = register_trace_sys_enter(ftrace_syscall_enter, tr);
- if (!ret) {
- WRITE_ONCE(tr->enter_syscall_files[num], file);
- tr->sys_refcount_enter++;
+ if (ret < 0) {
+ if (sys_data->user_mask)
+ syscall_fault_buffer_disable();
+ return ret;
+ }
}
- mutex_unlock(&syscall_trace_lock);
- return ret;
+ WRITE_ONCE(tr->enter_syscall_files[num], file);
+ tr->sys_refcount_enter++;
+ return 0;
}
static void unreg_event_syscall_enter(struct trace_event_file *file,
struct trace_event_call *call)
{
+ struct syscall_metadata *sys_data = call->data;
struct trace_array *tr = file->tr;
int num;
- num = ((struct syscall_metadata *)call->data)->syscall_nr;
+ num = sys_data->syscall_nr;
if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
return;
- mutex_lock(&syscall_trace_lock);
+ guard(mutex)(&syscall_trace_lock);
tr->sys_refcount_enter--;
WRITE_ONCE(tr->enter_syscall_files[num], NULL);
if (!tr->sys_refcount_enter)
unregister_trace_sys_enter(ftrace_syscall_enter, tr);
- mutex_unlock(&syscall_trace_lock);
+ if (sys_data->user_mask)
+ syscall_fault_buffer_disable();
}
static int reg_event_syscall_exit(struct trace_event_file *file,
@@ -459,6 +800,121 @@ static void unreg_event_syscall_exit(struct trace_event_file *file,
mutex_unlock(&syscall_trace_lock);
}
+/*
+ * For system calls that reference user space memory that can
+ * be recorded into the event, set the system call meta data's user_mask
+ * to the "args" index that points to the user space memory to retrieve.
+ */
+static void check_faultable_syscall(struct trace_event_call *call, int nr)
+{
+ struct syscall_metadata *sys_data = call->data;
+
+ /* Only work on entry */
+ if (sys_data->enter_event != call)
+ return;
+
+ switch (nr) {
+ /* user arg at position 0 */
+ case __NR_access:
+ case __NR_acct:
+ case __NR_add_key: /* Just _type. TODO add _description */
+ case __NR_chdir:
+ case __NR_chown:
+ case __NR_chmod:
+ case __NR_chroot:
+ case __NR_creat:
+ case __NR_delete_module:
+ case __NR_execve:
+ case __NR_fsopen:
+ case __NR_getxattr: /* Just pathname, TODO add name */
+ case __NR_lchown:
+ case __NR_lgetxattr: /* Just pathname, TODO add name */
+ case __NR_lremovexattr: /* Just pathname, TODO add name */
+ case __NR_link: /* Just oldname. TODO add newname */
+ case __NR_listxattr: /* Just pathname, TODO add list */
+ case __NR_llistxattr: /* Just pathname, TODO add list */
+ case __NR_lsetxattr: /* Just pathname, TODO add list */
+ case __NR_open:
+ case __NR_memfd_create:
+ case __NR_mount: /* Just dev_name, TODO add dir_name and type */
+ case __NR_mkdir:
+ case __NR_mknod:
+ case __NR_mq_open:
+ case __NR_mq_unlink:
+ case __NR_pivot_root: /* Just new_root, TODO add old_root */
+ case __NR_readlink:
+ case __NR_removexattr: /* Just pathname, TODO add name */
+ case __NR_rename: /* Just oldname. TODO add newname */
+ case __NR_request_key: /* Just _type. TODO add _description */
+ case __NR_rmdir:
+ case __NR_setxattr: /* Just pathname, TODO add list */
+ case __NR_shmdt:
+ case __NR_statfs:
+ case __NR_swapon:
+ case __NR_swapoff:
+ case __NR_symlink: /* Just oldname. TODO add newname */
+ case __NR_truncate:
+ case __NR_unlink:
+ case __NR_umount2:
+ case __NR_utime:
+ case __NR_utimes:
+ sys_data->user_mask = BIT(0);
+ break;
+ /* user arg at position 1 */
+ case __NR_execveat:
+ case __NR_faccessat:
+ case __NR_faccessat2:
+ case __NR_finit_module:
+ case __NR_fchmodat:
+ case __NR_fchmodat2:
+ case __NR_fchownat:
+ case __NR_fgetxattr:
+ case __NR_flistxattr:
+ case __NR_fsetxattr:
+ case __NR_fspick:
+ case __NR_fremovexattr:
+ case __NR_futimesat:
+ case __NR_getxattrat: /* Just pathname, TODO add name */
+ case __NR_inotify_add_watch:
+ case __NR_linkat: /* Just oldname. TODO add newname */
+ case __NR_listxattrat: /* Just pathname, TODO add list */
+ case __NR_mkdirat:
+ case __NR_mknodat:
+ case __NR_mount_setattr:
+ case __NR_move_mount: /* Just from_pathname, TODO add to_pathname */
+ case __NR_name_to_handle_at:
+ case __NR_newfstatat:
+ case __NR_openat:
+ case __NR_openat2:
+ case __NR_open_tree:
+ case __NR_open_tree_attr:
+ case __NR_readlinkat:
+ case __NR_renameat: /* Just oldname. TODO add newname */
+ case __NR_renameat2: /* Just oldname. TODO add newname */
+ case __NR_removexattrat: /* Just pathname, TODO add name */
+ case __NR_quotactl:
+ case __NR_setxattrat: /* Just pathname, TODO add list */
+ case __NR_syslog:
+ case __NR_symlinkat: /* Just oldname. TODO add newname */
+ case __NR_statx:
+ case __NR_unlinkat:
+ case __NR_utimensat:
+ sys_data->user_mask = BIT(1);
+ break;
+ /* user arg at position 2 */
+ case __NR_init_module:
+ case __NR_fsconfig:
+ sys_data->user_mask = BIT(2);
+ break;
+ /* user arg at position 4 */
+ case __NR_fanotify_mark:
+ sys_data->user_mask = BIT(4);
+ break;
+ default:
+ sys_data->user_mask = 0;
+ }
+}
+
static int __init init_syscall_trace(struct trace_event_call *call)
{
int id;
@@ -471,6 +927,8 @@ static int __init init_syscall_trace(struct trace_event_call *call)
return -ENOSYS;
}
+ check_faultable_syscall(call, num);
+
if (set_syscall_print_fmt(call) < 0)
return -ENOMEM;
--
2.47.2
Hello,

kernel test robot noticed "BUG:KASAN:slab-out-of-bounds_in_syscall_fault_buffer_enable" on:

commit: 6bc850d6f8f7308a184edfd60ee1acdd89ced128 ("[PATCH 3/7] tracing: Have syscall trace events read user space string")
url: https://github.com/intel-lab-lkp/linux/commits/Steven-Rostedt/tracing-Replace-syscall-RCU-pointer-assignment-with-READ-WRITE_ONCE/20250806-122312
base: https://git.kernel.org/cgit/linux/kernel/git/trace/linux-trace for-next
patch link: https://lore.kernel.org/all/20250805193235.080757106@kernel.org/
patch subject: [PATCH 3/7] tracing: Have syscall trace events read user space string

in testcase: boot

config: x86_64-rhel-9.4-kunit
compiler: gcc-12
test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 16G

(please refer to attached dmesg/kmsg for entire log/backtrace)

+---------------------------------------------------------------+------------+------------+
|                                                               | 63f89ba6a0 | 6bc850d6f8 |
+---------------------------------------------------------------+------------+------------+
| BUG:KASAN:slab-out-of-bounds_in_syscall_fault_buffer_enable   | 0          | 24         |
+---------------------------------------------------------------+------------+------------+

If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <oliver.sang@intel.com>
| Closes: https://lore.kernel.org/oe-lkp/202508110616.33657b6c-lkp@intel.com

[ 47.226292][ T1] BUG: KASAN: slab-out-of-bounds in syscall_fault_buffer_enable (kernel/trace/trace_syscalls.c:430)
[ 47.227603][ T1] Write of size 8 at addr ffff8881baea5f10 by task swapper/0/1
[ 47.228735][ T1]
[ 47.229107][ T1] CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.16.0-rc7-00138-g6bc850d6f8f7 #1 PREEMPT(voluntary)
[ 47.229114][ T1] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
[ 47.229117][ T1] Call Trace:
[ 47.229121][ T1]  <TASK>
[ 47.229124][ T1] dump_stack_lvl (lib/dump_stack.c:123 (discriminator 1))
[ 47.229134][ T1] print_address_description+0x2c/0x380
[ 47.229142][ T1] ? syscall_fault_buffer_enable (kernel/trace/trace_syscalls.c:430)
[ 47.229146][ T1] print_report (mm/kasan/report.c:481)
[ 47.229150][ T1] ? syscall_fault_buffer_enable (kernel/trace/trace_syscalls.c:430)
[ 47.229153][ T1] ? kasan_addr_to_slab (mm/kasan/common.c:37)
[ 47.229156][ T1] ? syscall_fault_buffer_enable (kernel/trace/trace_syscalls.c:430)
[ 47.229159][ T1] kasan_report (mm/kasan/report.c:595)
[ 47.229164][ T1] ? syscall_fault_buffer_enable (kernel/trace/trace_syscalls.c:430)
[ 47.229167][ T1] syscall_fault_buffer_enable (kernel/trace/trace_syscalls.c:430)
[ 47.229171][ T1] ? mutex_unlock (arch/x86/include/asm/atomic64_64.h:101 include/linux/atomic/atomic-arch-fallback.h:4329 include/linux/atomic/atomic-long.h:1506 include/linux/atomic/atomic-instrumented.h:4481 kernel/locking/mutex.c:167 kernel/locking/mutex.c:537)
[ 47.229177][ T1] syscall_enter_register (kernel/trace/trace_syscalls.c:729 kernel/trace/trace_syscalls.c:1259)
[ 47.229181][ T1] __ftrace_event_enable_disable (kernel/trace/trace_events.c:860)
[ 47.229186][ T1] ? __pfx__printk (kernel/printk/printk.c:2470)
[ 47.229192][ T1] __ftrace_set_clr_event_nolock (kernel/trace/trace_events.c:890 kernel/trace/trace_events.c:1353)
[ 47.229197][ T1] event_trace_self_tests (kernel/trace/trace_events.c:1384 (discriminator 1) kernel/trace/trace_events.c:4779 (discriminator 1))
[ 47.229203][ T1] ? __pfx_event_trace_self_tests_init (kernel/trace/trace_events.c:4892)
[ 47.229208][ T1] event_trace_self_tests_init (include/linux/list.h:373 kernel/trace/trace.h:487 kernel/trace/trace_events.c:4871 kernel/trace/trace_events.c:4894)
[ 47.229212][ T1] do_one_initcall (init/main.c:1274)
[ 47.229216][ T1] ? __pfx_do_one_initcall (init/main.c:1265)
[ 47.229219][ T1] ? __pfx_parse_args (kernel/params.c:168)
[ 47.229223][ T1] ? __kasan_kmalloc (include/linux/kfence.h:58 mm/kasan/common.c:390)
[ 47.229227][ T1] ? do_initcalls (include/linux/slab.h:909 include/linux/slab.h:1039 init/main.c:1345)
[ 47.229232][ T1] do_initcalls (init/main.c:1335 init/main.c:1352)
[ 47.229236][ T1] kernel_init_freeable (init/main.c:1586)
[ 47.229241][ T1] ? __pfx_kernel_init (init/main.c:1466)
[ 47.229247][ T1] kernel_init (init/main.c:1476)
[ 47.229251][ T1] ? calculate_sigpending (kernel/signal.c:194)
[ 47.229256][ T1] ? __pfx_kernel_init (init/main.c:1466)
[ 47.229259][ T1] ret_from_fork (arch/x86/kernel/process.c:154)
[ 47.229265][ T1] ? __pfx_kernel_init (init/main.c:1466)
[ 47.229269][ T1] ret_from_fork_asm (arch/x86/entry/entry_64.S:258)
[ 47.229275][ T1]  </TASK>
[ 47.229276][ T1]
[ 47.262431][ T1] Allocated by task 1:
[ 47.263075][ T1] kasan_save_stack (mm/kasan/common.c:48)
[ 47.263810][ T1] kasan_save_track (arch/x86/include/asm/current.h:25 mm/kasan/common.c:60 mm/kasan/common.c:69)
[ 47.264530][ T1] __kasan_kmalloc (mm/kasan/common.c:377 mm/kasan/common.c:394)
[ 47.265210][ T1] syscall_fault_buffer_enable (include/linux/slab.h:905 kernel/trace/trace_syscalls.c:426)
[ 47.266034][ T1] syscall_enter_register (kernel/trace/trace_syscalls.c:729 kernel/trace/trace_syscalls.c:1259)
[ 47.266825][ T1] __ftrace_event_enable_disable (kernel/trace/trace_events.c:860)
[ 47.267743][ T1] __ftrace_set_clr_event_nolock (kernel/trace/trace_events.c:890 kernel/trace/trace_events.c:1353)
[ 47.268609][ T1] event_trace_self_tests (kernel/trace/trace_events.c:1384 (discriminator 1) kernel/trace/trace_events.c:4779 (discriminator 1))
[ 47.269392][ T1] event_trace_self_tests_init (include/linux/list.h:373 kernel/trace/trace.h:487 kernel/trace/trace_events.c:4871 kernel/trace/trace_events.c:4894)
[ 47.270229][ T1] do_one_initcall (init/main.c:1274)
[ 47.270971][ T1] do_initcalls (init/main.c:1335 init/main.c:1352)
[ 47.271669][ T1] kernel_init_freeable (init/main.c:1586)
[ 47.272429][ T1] kernel_init (init/main.c:1476)
[ 47.273127][ T1] ret_from_fork (arch/x86/kernel/process.c:154)
[ 47.273817][ T1] ret_from_fork_asm (arch/x86/entry/entry_64.S:258)
[ 47.274547][ T1]
[ 47.274977][ T1] The buggy address belongs to the object at ffff8881baea5f00
[ 47.274977][ T1]  which belongs to the cache kmalloc-8 of size 8
[ 47.277002][ T1] The buggy address is located 8 bytes to the right of
[ 47.277002][ T1]  allocated 8-byte region [ffff8881baea5f00, ffff8881baea5f08)
[ 47.279110][ T1]
[ 47.279551][ T1] The buggy address belongs to the physical page:
[ 47.280447][ T1] page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x1baea5
[ 47.281773][ T1] flags: 0x17ffffc0000000(node=0|zone=2|lastcpupid=0x1fffff)
[ 47.282819][ T1] page_type: f5(slab)
[ 47.283449][ T1] raw: 0017ffffc0000000 ffff888100041500 dead000000000122 0000000000000000
[ 47.284709][ T1] raw: 0000000000000000 0000000080800080 00000000f5000000 0000000000000000
[ 47.286003][ T1] page dumped because: kasan: bad access detected
[ 47.286854][ T1] page_owner tracks the page as allocated
[ 47.287710][ T1] page last allocated via order 0, migratetype Unmovable, gfp_mask 0x52cc0(GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP), pid 1, tgid 1 (swapper/0), ts 35802055637, free_ts 0
[ 47.290101][ T1] post_alloc_hook (include/linux/page_owner.h:32 mm/page_alloc.c:1704)
[ 47.290799][ T1] get_page_from_freelist (mm/page_alloc.c:1714 mm/page_alloc.c:3669)
[ 47.291594][ T1] __alloc_frozen_pages_noprof (mm/page_alloc.c:4959)
[ 47.292391][ T1] alloc_pages_mpol (mm/mempolicy.c:2421)
[ 47.293117][ T1] allocate_slab (mm/slub.c:2451 mm/slub.c:2619)
[ 47.293768][ T1] ___slab_alloc (mm/slub.c:3859 (discriminator 3))
[ 47.294447][ T1] __kmalloc_node_track_caller_noprof (mm/slub.c:3949 mm/slub.c:4024 mm/slub.c:4185 mm/slub.c:4327 mm/slub.c:4347)
[ 47.295322][ T1] kstrdup (mm/util.c:63 mm/util.c:83)
[ 47.295898][ T1] __kernfs_new_node (fs/kernfs/dir.c:634)
[ 47.296626][ T1] kernfs_new_node (fs/kernfs/dir.c:713)
[ 47.297380][ T1] kernfs_create_dir_ns (fs/kernfs/dir.c:1085)
[ 47.298159][ T1] sysfs_create_dir_ns (fs/sysfs/dir.c:61)
[ 47.298933][ T1] kobject_add_internal (lib/kobject.c:73 lib/kobject.c:240)
[ 47.299712][ T1] kobject_init_and_add (lib/kobject.c:374 lib/kobject.c:457)
[ 47.300448][ T1] net_rx_queue_update_kobjects (net/core/net-sysfs.c:1239 net/core/net-sysfs.c:1301)
[ 47.301292][ T1] netdev_register_kobject (net/core/net-sysfs.c:2093 net/core/net-sysfs.c:2340)
[ 47.302015][ T1] page_owner free stack trace missing
[ 47.302748][ T1]
[ 47.303126][ T1] Memory state around the buggy address:
[ 47.303914][ T1]  ffff8881baea5e00: 06 fc fc fc 06 fc fc fc 04 fc fc fc 06 fc fc fc
[ 47.305085][ T1]  ffff8881baea5e80: 05 fc fc fc 05 fc fc fc 06 fc fc fc fc fc fc fc
[ 47.306239][ T1] >ffff8881baea5f00: 00 fc fc fc 07 fc fc fc 00 fc fc fc fa fc fc fc
[ 47.307378][ T1]                        ^
[ 47.308014][ T1]  ffff8881baea5f80: fa fc fc fc fc fc fc fc 06 fc fc fc 06 fc fc fc
[ 47.309220][ T1]  ffff8881baea6000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[ 47.310929][ T1] ==================================================================
[ 47.312186][ T1] Disabling lock debugging due to kernel taint
[ 47.318329][ T1] OK
[ 47.318896][ T1] Testing event system hyperv: OK

The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20250811/202508110616.33657b6c-lkp@intel.com

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
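
The allocation flagged above is the kmalloc() in syscall_fault_buffer_enable(),
which passes sizeof(sinfo) (the size of the pointer, hence the 8-byte
kmalloc-8 object in the report) rather than the size of the structure.
Assuming that is the root cause, the likely fix is:

    -	sinfo = kmalloc(sizeof(sinfo), GFP_KERNEL);
    +	sinfo = kmalloc(sizeof(*sinfo), GFP_KERNEL);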
Hi Steven,

kernel test robot noticed the following build errors:

[auto build test ERROR on trace/for-next]
[also build test ERROR on linus/master v6.16 next-20250806]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url: https://github.com/intel-lab-lkp/linux/commits/Steven-Rostedt/tracing-Replace-syscall-RCU-pointer-assignment-with-READ-WRITE_ONCE/20250806-122312
base: https://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace for-next
patch link: https://lore.kernel.org/r/20250805193235.080757106%40kernel.org
patch subject: [PATCH 3/7] tracing: Have syscall trace events read user space string
config: parisc-randconfig-r071-20250806 (https://download.01.org/0day-ci/archive/20250806/202508062230.puMRaDdE-lkp@intel.com/config)
compiler: hppa-linux-gcc (GCC) 11.5.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250806/202508062230.puMRaDdE-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202508062230.puMRaDdE-lkp@intel.com/

All errors (new ones prefixed by >>):

   kernel/trace/trace_syscalls.c: In function 'check_faultable_syscall':
>> kernel/trace/trace_syscalls.c:886:14: error: '__NR_newfstatat' undeclared (first use in this function); did you mean 'sys_newfstatat'?
     886 |         case __NR_newfstatat:
         |              ^~~~~~~~~~~~~~~
         |              sys_newfstatat
   kernel/trace/trace_syscalls.c:886:14: note: each undeclared identifier is reported only once for each function it appears in


vim +886 kernel/trace/trace_syscalls.c

   802
   803	/*
   804	 * For system calls that reference user space memory that can
   805	 * be recorded into the event, set the system call meta data's user_mask
   806	 * to the "args" index that points to the user space memory to retrieve.
   807	 */
   808	static void check_faultable_syscall(struct trace_event_call *call, int nr)
   809	{
   810		struct syscall_metadata *sys_data = call->data;
   811
   812		/* Only work on entry */
   813		if (sys_data->enter_event != call)
   814			return;
   815
   816		switch (nr) {
   817		/* user arg at position 0 */
   818		case __NR_access:
   819		case __NR_acct:
   820		case __NR_add_key:	/* Just _type. TODO add _description */
   821		case __NR_chdir:
   822		case __NR_chown:
   823		case __NR_chmod:
   824		case __NR_chroot:
   825		case __NR_creat:
   826		case __NR_delete_module:
   827		case __NR_execve:
   828		case __NR_fsopen:
   829		case __NR_getxattr:	/* Just pathname, TODO add name */
   830		case __NR_lchown:
   831		case __NR_lgetxattr:	/* Just pathname, TODO add name */
   832		case __NR_lremovexattr:	/* Just pathname, TODO add name */
   833		case __NR_link:		/* Just oldname. TODO add newname */
   834		case __NR_listxattr:	/* Just pathname, TODO add list */
   835		case __NR_llistxattr:	/* Just pathname, TODO add list */
   836		case __NR_lsetxattr:	/* Just pathname, TODO add list */
   837		case __NR_open:
   838		case __NR_memfd_create:
   839		case __NR_mount:	/* Just dev_name, TODO add dir_name and type */
   840		case __NR_mkdir:
   841		case __NR_mknod:
   842		case __NR_mq_open:
   843		case __NR_mq_unlink:
   844		case __NR_pivot_root:	/* Just new_root, TODO add old_root */
   845		case __NR_readlink:
   846		case __NR_removexattr:	/* Just pathname, TODO add name */
   847		case __NR_rename:	/* Just oldname. TODO add newname */
   848		case __NR_request_key:	/* Just _type. TODO add _description */
   849		case __NR_rmdir:
   850		case __NR_setxattr:	/* Just pathname, TODO add list */
   851		case __NR_shmdt:
   852		case __NR_statfs:
   853		case __NR_swapon:
   854		case __NR_swapoff:
   855		case __NR_symlink:	/* Just oldname. TODO add newname */
   856		case __NR_truncate:
   857		case __NR_unlink:
   858		case __NR_umount2:
   859		case __NR_utime:
   860		case __NR_utimes:
   861			sys_data->user_mask = BIT(0);
   862			break;
   863		/* user arg at position 1 */
   864		case __NR_execveat:
   865		case __NR_faccessat:
   866		case __NR_faccessat2:
   867		case __NR_finit_module:
   868		case __NR_fchmodat:
   869		case __NR_fchmodat2:
   870		case __NR_fchownat:
   871		case __NR_fgetxattr:
   872		case __NR_flistxattr:
   873		case __NR_fsetxattr:
   874		case __NR_fspick:
   875		case __NR_fremovexattr:
   876		case __NR_futimesat:
   877		case __NR_getxattrat:	/* Just pathname, TODO add name */
   878		case __NR_inotify_add_watch:
   879		case __NR_linkat:	/* Just oldname. TODO add newname */
   880		case __NR_listxattrat:	/* Just pathname, TODO add list */
   881		case __NR_mkdirat:
   882		case __NR_mknodat:
   883		case __NR_mount_setattr:
   884		case __NR_move_mount:	/* Just from_pathname, TODO add to_pathname */
   885		case __NR_name_to_handle_at:
 > 886		case __NR_newfstatat:
   887		case __NR_openat:
   888		case __NR_openat2:
   889		case __NR_open_tree:
   890		case __NR_open_tree_attr:
   891		case __NR_readlinkat:
   892		case __NR_renameat:	/* Just oldname. TODO add newname */
   893		case __NR_renameat2:	/* Just oldname. TODO add newname */
   894		case __NR_removexattrat:	/* Just pathname, TODO add name */
   895		case __NR_quotactl:
   896		case __NR_setxattrat:	/* Just pathname, TODO add list */
   897		case __NR_syslog:
   898		case __NR_symlinkat:	/* Just oldname. TODO add newname */
   899		case __NR_statx:
   900		case __NR_unlinkat:
   901		case __NR_utimensat:
   902			sys_data->user_mask = BIT(1);
   903			break;
   904		/* user arg at position 2 */
   905		case __NR_init_module:
   906		case __NR_fsconfig:
   907			sys_data->user_mask = BIT(2);
   908			break;
   909		/* user arg at position 4 */
   910		case __NR_fanotify_mark:
   911			sys_data->user_mask = BIT(4);
   912			break;
   913		default:
   914			sys_data->user_mask = 0;
   915		}
   916	}
   917

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
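
The error shows that __NR_newfstatat is not defined on every architecture
(some of the other syscall numbers in the table may have the same problem).
Assuming the table is meant to stay architecture independent, one
conventional way to handle such entries would be to guard them, for example:

    #ifdef __NR_newfstatat
    	case __NR_newfstatat:
    #endif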