From: Steven Rostedt <rostedt@goodmis.org>
Allow more than one field of a syscall trace event to read user space.
Build on top of the user_mask by allowing more than one bit to be set that
corresponds to the @args array of the syscall metadata. Each argument
in the @args array that is to be read will have a dynamic array/string
field associated with it.
Note that reading multiple fields from user space is not supported if
the user_arg_size field is set in the syscall metadata. That field can only
be used if only one field is being read from user space as that field is a
number representing the size field of the syscall event that holds the
size of the data to read from user space. It becomes ambiguous if the
system call reads more than one field. Currently this is not an issue.
If a syscall event happens to enable two fields to be read from user space and
sets the user_arg_size field, it will trigger a warning at boot and the
user_arg_size field will be cleared.
The per CPU buffer that is used to read the user space addresses is now
broken up into 3 sections, each of 168 bytes. The reason for 168 is that
it is the biggest portion of 512 bytes divided by 3 that is 8 byte aligned.
The max amount copied into the ring buffer from user space is now only 128
bytes, which is plenty. When reading user space, it still reads up to 167
(168-1) bytes, and if more than 128 bytes were read it appends the extra
"..." to the end of the string to show that it was truncated.
This will allow the event to look like this:
sys_renameat2(olddfd: 0xffffff9c, oldname: 0x7ffe02facdff "/tmp/x", newdfd: 0xffffff9c, newname: 0x7ffe02face06 "/tmp/y", flags: 1)
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
kernel/trace/trace_syscalls.c | 312 ++++++++++++++++++++++------------
1 file changed, 207 insertions(+), 105 deletions(-)
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 8c0142eea898..b39fa9dd1067 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -139,6 +139,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
struct syscall_metadata *entry;
int i, syscall, val, len;
unsigned char *ptr;
+ int offset = 0;
trace = (typeof(trace))ent;
syscall = trace->nr;
@@ -178,11 +179,12 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
continue;
/* This arg points to a user space string */
- ptr = (void *)trace->args + sizeof(long) * entry->nb_args;
+ ptr = (void *)trace->args + sizeof(long) * entry->nb_args + offset;
val = *(int *)ptr;
ptr = (void *)ent + (val & 0xffff);
len = val >> 16;
+ offset += 4;
if (entry->user_arg_size < 0 || entry->user_arg_is_str) {
trace_seq_printf(s, " \"%.*s\"", len, ptr);
@@ -335,7 +337,6 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call)
unsigned long mask;
char *arg;
int offset = offsetof(typeof(trace), args);
- int idx;
int ret = 0;
int len;
int i;
@@ -354,27 +355,35 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call)
return ret;
mask = meta->user_mask;
- idx = ffs(mask) - 1;
- /*
- * User space data is faulted into a temporary buffer and then
- * added as a dynamic string or array to the end of the event.
- * The user space data name for the arg pointer is "__<arg>_val".
- */
- len = strlen(meta->args[idx]) + sizeof("___val");
- arg = kmalloc(len, GFP_KERNEL);
- if (WARN_ON_ONCE(!arg)) {
- meta->user_mask = 0;
- return -ENOMEM;
- }
+ while (mask) {
+ int idx = ffs(mask) - 1;
+ mask &= ~BIT(idx);
+
+ /*
+ * User space data is faulted into a temporary buffer and then
+ * added as a dynamic string or array to the end of the event.
+ * The user space data name for the arg pointer is
+ * "__<arg>_val".
+ */
+ len = strlen(meta->args[idx]) + sizeof("___val");
+ arg = kmalloc(len, GFP_KERNEL);
+ if (WARN_ON_ONCE(!arg)) {
+ meta->user_mask = 0;
+ return -ENOMEM;
+ }
- snprintf(arg, len, "__%s_val", meta->args[idx]);
+ snprintf(arg, len, "__%s_val", meta->args[idx]);
- ret = trace_define_field(call, "__data_loc char[]",
- arg, offset, sizeof(int), 0,
- FILTER_OTHER);
- if (ret)
- kfree(arg);
+ ret = trace_define_field(call, "__data_loc char[]",
+ arg, offset, sizeof(int), 0,
+ FILTER_OTHER);
+ if (ret) {
+ kfree(arg);
+ break;
+ }
+ offset += 4;
+ }
return ret;
}
@@ -387,8 +396,25 @@ struct syscall_buf_info {
struct syscall_buf __percpu *sbuf;
};
-/* Create a per CPU temporary buffer to copy user space pointers into */
+/*
+ * Create a per CPU temporary buffer to copy user space pointers into.
+ *
+ * SYSCALL_FAULT_BUF_SZ holds the size of the per CPU buffer to use
+ * to copy memory from user space addresses into.
+ *
+ * SYSCALL_FAULT_ARG_SZ is the amount to copy from user space.
+ *
+ * SYSCALL_FAULT_USER_MAX is the amount to copy into the ring buffer.
+ * It's slightly smaller than SYSCALL_FAULT_ARG_SZ to know if it
+ * needs to append the EXTRA or not.
+ *
+ * This only allows up to 3 args from system calls.
+ */
#define SYSCALL_FAULT_BUF_SZ 512
+#define SYSCALL_FAULT_ARG_SZ 168
+#define SYSCALL_FAULT_USER_MAX 128
+#define SYSCALL_FAULT_MAX_CNT 3
+
static struct syscall_buf_info *syscall_buffer;
static DEFINE_PER_CPU(unsigned long, sched_switch_cnt);
@@ -498,22 +524,57 @@ static void syscall_fault_buffer_disable(void)
call_rcu_tasks_trace(&sinfo->rcu, rcu_free_syscall_buffer);
}
-static char *sys_fault_user(struct syscall_metadata *sys_data, struct syscall_buf_info *sinfo,
- unsigned long *args, unsigned int *data_size)
+static char *sys_fault_user(struct syscall_metadata *sys_data,
+ struct syscall_buf_info *sinfo,
+ unsigned long *args,
+ unsigned int data_size[SYSCALL_FAULT_MAX_CNT])
{
- char *buf = per_cpu_ptr(sinfo->sbuf, smp_processor_id())->buf;
- unsigned long size = SYSCALL_FAULT_BUF_SZ - 1;
+ char *buffer = per_cpu_ptr(sinfo->sbuf, smp_processor_id())->buf;
unsigned long mask = sys_data->user_mask;
+ unsigned long size = SYSCALL_FAULT_ARG_SZ - 1;
unsigned int cnt;
- int idx = ffs(mask) - 1;
bool array = false;
- char *ptr;
+ char *ptr_array[SYSCALL_FAULT_MAX_CNT];
+ char *buf;
+ int read[SYSCALL_FAULT_MAX_CNT];
int trys = 0;
+ int uargs;
int ret;
+ int i = 0;
+
+ /* The extra is appended to the user data in the buffer */
+ BUILD_BUG_ON(SYSCALL_FAULT_USER_MAX + sizeof(EXTRA) >=
+ SYSCALL_FAULT_ARG_SZ);
+
+ /*
+ * If this system call event has a size argument, use
+ * it to define how much of user space memory to read,
+ * and read it as an array and not a string.
+ */
+ if (sys_data->user_arg_size >= 0) {
+ array = true;
+ size = args[sys_data->user_arg_size];
+ if (size > SYSCALL_FAULT_ARG_SZ - 1)
+ size = SYSCALL_FAULT_ARG_SZ - 1;
+ }
+
+ while (mask) {
+ int idx = ffs(mask) - 1;
+ mask &= ~BIT(idx);
+
+ if (WARN_ON_ONCE(i == SYSCALL_FAULT_MAX_CNT))
+ break;
+
+ /* Get the pointer to user space memory to read */
+ ptr_array[i++] = (char *)args[idx];
+ }
- /* Get the pointer to user space memory to read */
- ptr = (char *)args[idx];
- *data_size = 0;
+ uargs = i;
+
+ /* Clear the values that are not used */
+ for (; i < SYSCALL_FAULT_MAX_CNT; i++) {
+ data_size[i] = -1; /* Denotes no pointer */
+ }
again:
/*
@@ -532,24 +593,12 @@ static char *sys_fault_user(struct syscall_metadata *sys_data, struct syscall_bu
if (!once)
pr_warn("Error: Too many tries to read syscall %s\n", sys_data->name);
once = true;
- return buf;
+ return buffer;
}
/* Read the current sched switch count */
cnt = this_cpu_read(sched_switch_cnt);
- /*
- * If this system call event has a size argument, use
- * it to define how much of user space memory to read,
- * and read it as an array and not a string.
- */
- if (sys_data->user_arg_size >= 0) {
- array = true;
- size = args[sys_data->user_arg_size];
- if (size > SYSCALL_FAULT_BUF_SZ - 1)
- size = SYSCALL_FAULT_BUF_SZ - 1;
- }
-
/*
* Preemption is going to be enabled, but this task must
* remain on this CPU.
@@ -562,20 +611,23 @@ static char *sys_fault_user(struct syscall_metadata *sys_data, struct syscall_bu
*/
preempt_enable_notrace();
- if (array) {
- ret = __copy_from_user(buf, ptr, size);
- ret = ret ? -1 : size;
- } else {
- ret = strncpy_from_user(buf, ptr, size);
+ buf = buffer;
+
+ for (i = 0; i < uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {
+ char *ptr = ptr_array[i];
+
+ if (array) {
+ ret = __copy_from_user(buf, ptr, size);
+ ret = ret ? -1 : size;
+ } else {
+ ret = strncpy_from_user(buf, ptr, size);
+ }
+ read[i] = ret;
}
preempt_disable_notrace();
migrate_enable();
- /* If it faulted, no use to try again */
- if (ret < 0)
- return buf;
-
/*
* Preemption is disabled again, now check the sched_switch_cnt.
* If it increased by two or more, then another user space process
@@ -592,28 +644,39 @@ static char *sys_fault_user(struct syscall_metadata *sys_data, struct syscall_bu
if (this_cpu_read(sched_switch_cnt) > cnt + 1)
goto again;
- /* For strings, replace any non-printable characters with '.' */
- if (!array) {
- for (int i = 0; i < ret; i++) {
- if (!isprint(buf[i]))
- buf[i] = '.';
- }
+ buf = buffer;
+ for (i = 0; i < uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {
- /*
- * If the text was truncated due to our max limit, add "..." to
- * the string.
- */
- if (ret > SYSCALL_FAULT_BUF_SZ - sizeof(EXTRA)) {
- strscpy(buf + SYSCALL_FAULT_BUF_SZ - sizeof(EXTRA),
- EXTRA, sizeof(EXTRA));
- ret = SYSCALL_FAULT_BUF_SZ;
+ ret = read[i];
+ if (ret < 0)
+ continue;
+ buf[ret] = '\0';
+
+ /* For strings, replace any non-printable characters with '.' */
+ if (!array) {
+ for (int x = 0; x < ret; x++) {
+ if (!isprint(buf[x]))
+ buf[x] = '.';
+ }
+
+ /*
+ * If the text was truncated due to our max limit,
+ * add "..." to the string.
+ */
+ if (ret > SYSCALL_FAULT_USER_MAX) {
+ strscpy(buf + SYSCALL_FAULT_USER_MAX, EXTRA,
+ sizeof(EXTRA));
+ ret = SYSCALL_FAULT_USER_MAX + sizeof(EXTRA);
+ } else {
+ buf[ret++] = '\0';
+ }
} else {
- buf[ret++] = '\0';
+ ret = min(ret, SYSCALL_FAULT_USER_MAX);
}
+ data_size[i] = ret;
}
- *data_size = ret;
- return buf;
+ return buffer;
}
static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
@@ -625,9 +688,10 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
struct trace_event_buffer fbuffer;
unsigned long args[6];
char *user_ptr;
- int user_size = 0;
+ int user_sizes[SYSCALL_FAULT_MAX_CNT] = {};
int syscall_nr;
int size = 0;
+ int uargs = 0;
bool mayfault;
/*
@@ -660,20 +724,27 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
if (mayfault) {
struct syscall_buf_info *sinfo;
+ int i;
/* If the syscall_buffer is NULL, tracing is being shutdown */
sinfo = READ_ONCE(syscall_buffer);
if (!sinfo)
return;
- user_ptr = sys_fault_user(sys_data, sinfo, args, &user_size);
+ user_ptr = sys_fault_user(sys_data, sinfo, args, user_sizes);
/*
* user_size is the amount of data to append.
* Need to add 4 for the meta field that points to
* the user memory at the end of the event and also
* stores its size.
*/
- size = 4 + user_size;
+ for (i = 0; i < SYSCALL_FAULT_MAX_CNT; i++) {
+ if (user_sizes[i] < 0)
+ break;
+ size += user_sizes[i] + 4;
+ }
+ /* Save the number of user read arguments of this syscall */
+ uargs = i;
}
size += sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
@@ -688,6 +759,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args);
if (mayfault) {
+ char *buf = user_ptr;
void *ptr;
int val;
@@ -699,21 +771,30 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
/*
* The meta data will store the offset of the user data from
- * the beginning of the event.
+ * the beginning of the event. That is after the static arguments
+ * and the meta data fields.
*/
- val = (ptr - (void *)entry) + 4;
+ val = (ptr - (void *)entry) + 4 * uargs;
+
+ for (int i = 0; i < uargs; i++) {
- /* Store the offset and the size into the meta data */
- *(int *)ptr = val | (user_size << 16);
+ if (i)
+ val += user_sizes[i - 1];
- if (WARN_ON_ONCE((ptr - (void *)entry + user_size) > size))
- user_size = 0;
+ /* Store the offset and the size into the meta data */
+ *(int *)ptr = val | (user_sizes[i] << 16);
- /* Nothing to do if the user space was empty or faulted */
- if (user_size) {
- /* Now store the user space data into the event */
+ /* Skip the meta data */
ptr += 4;
- memcpy(ptr, user_ptr, user_size);
+ }
+
+ for (int i = 0; i < uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {
+ /* Nothing to do if the user space was empty or faulted */
+ if (!user_sizes[i])
+ continue;
+
+ memcpy(ptr, buf, user_sizes[i]);
+ ptr += user_sizes[i];
}
}
@@ -857,6 +938,7 @@ static void unreg_event_syscall_exit(struct trace_event_file *file,
static void check_faultable_syscall(struct trace_event_call *call, int nr)
{
struct syscall_metadata *sys_data = call->data;
+ unsigned long mask;
/* Only work on entry */
if (sys_data->enter_event != call)
@@ -888,7 +970,6 @@ static void check_faultable_syscall(struct trace_event_call *call, int nr)
/* user arg at position 0 */
case __NR_access:
case __NR_acct:
- case __NR_add_key: /* Just _type. TODO add _description */
case __NR_chdir:
case __NR_chown:
case __NR_chmod:
@@ -897,28 +978,15 @@ static void check_faultable_syscall(struct trace_event_call *call, int nr)
case __NR_delete_module:
case __NR_execve:
case __NR_fsopen:
- case __NR_getxattr: /* Just pathname, TODO add name */
case __NR_lchown:
- case __NR_lgetxattr: /* Just pathname, TODO add name */
- case __NR_lremovexattr: /* Just pathname, TODO add name */
- case __NR_link: /* Just oldname. TODO add newname */
- case __NR_listxattr: /* Just pathname, TODO add list */
- case __NR_llistxattr: /* Just pathname, TODO add list */
- case __NR_lsetxattr: /* Just pathname, TODO add list */
case __NR_open:
case __NR_memfd_create:
- case __NR_mount: /* Just dev_name, TODO add dir_name and type */
case __NR_mkdir:
case __NR_mknod:
case __NR_mq_open:
case __NR_mq_unlink:
- case __NR_pivot_root: /* Just new_root, TODO add old_root */
case __NR_readlink:
- case __NR_removexattr: /* Just pathname, TODO add name */
- case __NR_rename: /* Just oldname. TODO add newname */
- case __NR_request_key: /* Just _type. TODO add _description */
case __NR_rmdir:
- case __NR_setxattr: /* Just pathname, TODO add list */
case __NR_shmdt:
case __NR_statfs:
case __NR_swapon:
@@ -945,14 +1013,10 @@ static void check_faultable_syscall(struct trace_event_call *call, int nr)
case __NR_fspick:
case __NR_fremovexattr:
case __NR_futimesat:
- case __NR_getxattrat: /* Just pathname, TODO add name */
case __NR_inotify_add_watch:
- case __NR_linkat: /* Just oldname. TODO add newname */
- case __NR_listxattrat: /* Just pathname, TODO add list */
case __NR_mkdirat:
case __NR_mknodat:
case __NR_mount_setattr:
- case __NR_move_mount: /* Just from_pathname, TODO add to_pathname */
case __NR_name_to_handle_at:
case __NR_newfstatat:
case __NR_openat:
@@ -960,13 +1024,8 @@ static void check_faultable_syscall(struct trace_event_call *call, int nr)
case __NR_open_tree:
case __NR_open_tree_attr:
case __NR_readlinkat:
- case __NR_renameat: /* Just oldname. TODO add newname */
- case __NR_renameat2: /* Just oldname. TODO add newname */
- case __NR_removexattrat: /* Just pathname, TODO add name */
case __NR_quotactl:
- case __NR_setxattrat: /* Just pathname, TODO add list */
case __NR_syslog:
- case __NR_symlinkat: /* Just oldname. TODO add newname */
case __NR_statx:
case __NR_unlinkat:
case __NR_utimensat:
@@ -981,9 +1040,52 @@ static void check_faultable_syscall(struct trace_event_call *call, int nr)
case __NR_fanotify_mark:
sys_data->user_mask = BIT(4);
break;
+ /* 2 user args, 0 and 1 */
+ case __NR_add_key:
+ case __NR_getxattr:
+ case __NR_lgetxattr:
+ case __NR_lremovexattr:
+ case __NR_link:
+ case __NR_listxattr:
+ case __NR_llistxattr:
+ case __NR_lsetxattr:
+ case __NR_pivot_root:
+ case __NR_removexattr:
+ case __NR_rename:
+ case __NR_request_key:
+ case __NR_setxattr:
+ case __NR_symlinkat:
+ sys_data->user_mask = BIT(0) | BIT(1);
+ break;
+ /* 2 user args, 1 and 3 */
+ case __NR_getxattrat:
+ case __NR_linkat:
+ case __NR_listxattrat:
+ case __NR_move_mount:
+ case __NR_renameat:
+ case __NR_renameat2:
+ case __NR_removexattrat:
+ case __NR_setxattrat:
+ sys_data->user_mask = BIT(1) | BIT(3);
+ break;
+ case __NR_mount: /* Just dev_name and dir_name, TODO add type */
+ sys_data->user_mask = BIT(0) | BIT(1) | BIT(2);
+ break;
default:
sys_data->user_mask = 0;
+ return;
}
+
+ if (sys_data->user_arg_size < 0)
+ return;
+
+ /*
+ * The user_arg_size can only be used when the system call
+ * is reading only a single address from user space.
+ */
+ mask = sys_data->user_mask;
+ if (WARN_ON(mask & (mask - 1)))
+ sys_data->user_arg_size = -1;
}
static int __init init_syscall_trace(struct trace_event_call *call)
--
2.47.2
Hi Steven, kernel test robot noticed the following build warnings: [auto build test WARNING on trace/for-next] [also build test WARNING on linus/master v6.16 next-20250806] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch#_base_tree_information] url: https://github.com/intel-lab-lkp/linux/commits/Steven-Rostedt/tracing-Replace-syscall-RCU-pointer-assignment-with-READ-WRITE_ONCE/20250806-122312 base: https://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace for-next patch link: https://lore.kernel.org/r/20250805193235.582013098%40kernel.org patch subject: [PATCH 6/7] tracing: Allow syscall trace events to read more than one user parameter config: x86_64-randconfig-123-20250806 (https://download.01.org/0day-ci/archive/20250807/202508070706.TiTQY0Ne-lkp@intel.com/config) compiler: gcc-12 (Debian 12.2.0-14+deb12u1) 12.2.0 reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250807/202508070706.TiTQY0Ne-lkp@intel.com/reproduce) If you fix the issue in a separate patch/commit (i.e. 
not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot <lkp@intel.com> | Closes: https://lore.kernel.org/oe-kbuild-all/202508070706.TiTQY0Ne-lkp@intel.com/ sparse warnings: (new ones prefixed by >>) >> kernel/trace/trace_syscalls.c:620:53: sparse: sparse: incorrect type in argument 2 (different address spaces) @@ expected void const [noderef] __user *from @@ got char *ptr @@ kernel/trace/trace_syscalls.c:620:53: sparse: expected void const [noderef] __user *from kernel/trace/trace_syscalls.c:620:53: sparse: got char *ptr >> kernel/trace/trace_syscalls.c:623:54: sparse: sparse: incorrect type in argument 2 (different address spaces) @@ expected char const [noderef] __user *src @@ got char *ptr @@ kernel/trace/trace_syscalls.c:623:54: sparse: expected char const [noderef] __user *src kernel/trace/trace_syscalls.c:623:54: sparse: got char *ptr kernel/trace/trace_syscalls.c:707:20: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct trace_event_file *trace_file @@ got struct trace_event_file [noderef] __rcu * @@ kernel/trace/trace_syscalls.c:707:20: sparse: expected struct trace_event_file *trace_file kernel/trace/trace_syscalls.c:707:20: sparse: got struct trace_event_file [noderef] __rcu * kernel/trace/trace_syscalls.c:824:20: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct trace_event_file *trace_file @@ got struct trace_event_file [noderef] __rcu * @@ kernel/trace/trace_syscalls.c:824:20: sparse: expected struct trace_event_file *trace_file kernel/trace/trace_syscalls.c:824:20: sparse: got struct trace_event_file [noderef] __rcu * kernel/trace/trace_syscalls.c:871:9: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct trace_event_file [noderef] __rcu *volatile @@ got struct trace_event_file *file @@ kernel/trace/trace_syscalls.c:871:9: sparse: expected struct trace_event_file [noderef] __rcu 
*volatile kernel/trace/trace_syscalls.c:871:9: sparse: got struct trace_event_file *file kernel/trace/trace_syscalls.c:909:17: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct trace_event_file [noderef] __rcu *volatile @@ got struct trace_event_file *file @@ kernel/trace/trace_syscalls.c:909:17: sparse: expected struct trace_event_file [noderef] __rcu *volatile kernel/trace/trace_syscalls.c:909:17: sparse: got struct trace_event_file *file vim +620 kernel/trace/trace_syscalls.c 6bc850d6f8f730 Steven Rostedt 2025-08-05 526 623bd9e046f95c Steven Rostedt 2025-08-05 527 static char *sys_fault_user(struct syscall_metadata *sys_data, 623bd9e046f95c Steven Rostedt 2025-08-05 528 struct syscall_buf_info *sinfo, 623bd9e046f95c Steven Rostedt 2025-08-05 529 unsigned long *args, 623bd9e046f95c Steven Rostedt 2025-08-05 530 unsigned int data_size[SYSCALL_FAULT_MAX_CNT]) 6bc850d6f8f730 Steven Rostedt 2025-08-05 531 { 623bd9e046f95c Steven Rostedt 2025-08-05 532 char *buffer = per_cpu_ptr(sinfo->sbuf, smp_processor_id())->buf; 6bc850d6f8f730 Steven Rostedt 2025-08-05 533 unsigned long mask = sys_data->user_mask; 623bd9e046f95c Steven Rostedt 2025-08-05 534 unsigned long size = SYSCALL_FAULT_ARG_SZ - 1; 6bc850d6f8f730 Steven Rostedt 2025-08-05 535 unsigned int cnt; b979d33ec48bbd Steven Rostedt 2025-08-05 536 bool array = false; 623bd9e046f95c Steven Rostedt 2025-08-05 537 char *ptr_array[SYSCALL_FAULT_MAX_CNT]; 623bd9e046f95c Steven Rostedt 2025-08-05 538 char *buf; 623bd9e046f95c Steven Rostedt 2025-08-05 539 int read[SYSCALL_FAULT_MAX_CNT]; 6bc850d6f8f730 Steven Rostedt 2025-08-05 540 int trys = 0; 623bd9e046f95c Steven Rostedt 2025-08-05 541 int uargs; 6bc850d6f8f730 Steven Rostedt 2025-08-05 542 int ret; 623bd9e046f95c Steven Rostedt 2025-08-05 543 int i = 0; 623bd9e046f95c Steven Rostedt 2025-08-05 544 623bd9e046f95c Steven Rostedt 2025-08-05 545 /* The extra is appended to the user data in the buffer */ 623bd9e046f95c Steven 
Rostedt 2025-08-05 546 BUILD_BUG_ON(SYSCALL_FAULT_USER_MAX + sizeof(EXTRA) >= 623bd9e046f95c Steven Rostedt 2025-08-05 547 SYSCALL_FAULT_ARG_SZ); 623bd9e046f95c Steven Rostedt 2025-08-05 548 623bd9e046f95c Steven Rostedt 2025-08-05 549 /* 623bd9e046f95c Steven Rostedt 2025-08-05 550 * If this system call event has a size argument, use 623bd9e046f95c Steven Rostedt 2025-08-05 551 * it to define how much of user space memory to read, 623bd9e046f95c Steven Rostedt 2025-08-05 552 * and read it as an array and not a string. 623bd9e046f95c Steven Rostedt 2025-08-05 553 */ 623bd9e046f95c Steven Rostedt 2025-08-05 554 if (sys_data->user_arg_size >= 0) { 623bd9e046f95c Steven Rostedt 2025-08-05 555 array = true; 623bd9e046f95c Steven Rostedt 2025-08-05 556 size = args[sys_data->user_arg_size]; 623bd9e046f95c Steven Rostedt 2025-08-05 557 if (size > SYSCALL_FAULT_ARG_SZ - 1) 623bd9e046f95c Steven Rostedt 2025-08-05 558 size = SYSCALL_FAULT_ARG_SZ - 1; 623bd9e046f95c Steven Rostedt 2025-08-05 559 } 623bd9e046f95c Steven Rostedt 2025-08-05 560 623bd9e046f95c Steven Rostedt 2025-08-05 561 while (mask) { 623bd9e046f95c Steven Rostedt 2025-08-05 562 int idx = ffs(mask) - 1; 623bd9e046f95c Steven Rostedt 2025-08-05 563 mask &= ~BIT(idx); 623bd9e046f95c Steven Rostedt 2025-08-05 564 623bd9e046f95c Steven Rostedt 2025-08-05 565 if (WARN_ON_ONCE(i == SYSCALL_FAULT_MAX_CNT)) 623bd9e046f95c Steven Rostedt 2025-08-05 566 break; 6bc850d6f8f730 Steven Rostedt 2025-08-05 567 6bc850d6f8f730 Steven Rostedt 2025-08-05 568 /* Get the pointer to user space memory to read */ 623bd9e046f95c Steven Rostedt 2025-08-05 569 ptr_array[i++] = (char *)args[idx]; 623bd9e046f95c Steven Rostedt 2025-08-05 570 } 623bd9e046f95c Steven Rostedt 2025-08-05 571 623bd9e046f95c Steven Rostedt 2025-08-05 572 uargs = i; 623bd9e046f95c Steven Rostedt 2025-08-05 573 623bd9e046f95c Steven Rostedt 2025-08-05 574 /* Clear the values that are not used */ 623bd9e046f95c Steven Rostedt 2025-08-05 575 for (; i < 
SYSCALL_FAULT_MAX_CNT; i++) { 623bd9e046f95c Steven Rostedt 2025-08-05 576 data_size[i] = -1; /* Denotes no pointer */ 623bd9e046f95c Steven Rostedt 2025-08-05 577 } 6bc850d6f8f730 Steven Rostedt 2025-08-05 578 6bc850d6f8f730 Steven Rostedt 2025-08-05 579 again: 6bc850d6f8f730 Steven Rostedt 2025-08-05 580 /* 6bc850d6f8f730 Steven Rostedt 2025-08-05 581 * If this task is preempted by another user space task, it 6bc850d6f8f730 Steven Rostedt 2025-08-05 582 * will cause this task to try again. But just in case something 6bc850d6f8f730 Steven Rostedt 2025-08-05 583 * changes where the copying from user space causes another task 6bc850d6f8f730 Steven Rostedt 2025-08-05 584 * to run, prevent this from going into an infinite loop. 6bc850d6f8f730 Steven Rostedt 2025-08-05 585 * 10 tries should be plenty. 6bc850d6f8f730 Steven Rostedt 2025-08-05 586 */ 6bc850d6f8f730 Steven Rostedt 2025-08-05 587 if (trys++ > 10) { 6bc850d6f8f730 Steven Rostedt 2025-08-05 588 static bool once; 6bc850d6f8f730 Steven Rostedt 2025-08-05 589 /* 6bc850d6f8f730 Steven Rostedt 2025-08-05 590 * Only print a message instead of a WARN_ON() as this could 6bc850d6f8f730 Steven Rostedt 2025-08-05 591 * theoretically trigger under real load. 
6bc850d6f8f730 Steven Rostedt 2025-08-05 592 */ 6bc850d6f8f730 Steven Rostedt 2025-08-05 593 if (!once) 6bc850d6f8f730 Steven Rostedt 2025-08-05 594 pr_warn("Error: Too many tries to read syscall %s\n", sys_data->name); 6bc850d6f8f730 Steven Rostedt 2025-08-05 595 once = true; 623bd9e046f95c Steven Rostedt 2025-08-05 596 return buffer; 6bc850d6f8f730 Steven Rostedt 2025-08-05 597 } 6bc850d6f8f730 Steven Rostedt 2025-08-05 598 6bc850d6f8f730 Steven Rostedt 2025-08-05 599 /* Read the current sched switch count */ 6bc850d6f8f730 Steven Rostedt 2025-08-05 600 cnt = this_cpu_read(sched_switch_cnt); 6bc850d6f8f730 Steven Rostedt 2025-08-05 601 6bc850d6f8f730 Steven Rostedt 2025-08-05 602 /* 6bc850d6f8f730 Steven Rostedt 2025-08-05 603 * Preemption is going to be enabled, but this task must 6bc850d6f8f730 Steven Rostedt 2025-08-05 604 * remain on this CPU. 6bc850d6f8f730 Steven Rostedt 2025-08-05 605 */ 6bc850d6f8f730 Steven Rostedt 2025-08-05 606 migrate_disable(); 6bc850d6f8f730 Steven Rostedt 2025-08-05 607 6bc850d6f8f730 Steven Rostedt 2025-08-05 608 /* 6bc850d6f8f730 Steven Rostedt 2025-08-05 609 * Now preemption is being enabed and another task can come in 6bc850d6f8f730 Steven Rostedt 2025-08-05 610 * and use the same buffer and corrupt our data. 6bc850d6f8f730 Steven Rostedt 2025-08-05 611 */ 6bc850d6f8f730 Steven Rostedt 2025-08-05 612 preempt_enable_notrace(); 6bc850d6f8f730 Steven Rostedt 2025-08-05 613 623bd9e046f95c Steven Rostedt 2025-08-05 614 buf = buffer; 623bd9e046f95c Steven Rostedt 2025-08-05 615 623bd9e046f95c Steven Rostedt 2025-08-05 616 for (i = 0; i < uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) { 623bd9e046f95c Steven Rostedt 2025-08-05 617 char *ptr = ptr_array[i]; 623bd9e046f95c Steven Rostedt 2025-08-05 618 b979d33ec48bbd Steven Rostedt 2025-08-05 619 if (array) { b979d33ec48bbd Steven Rostedt 2025-08-05 @620 ret = __copy_from_user(buf, ptr, size); b979d33ec48bbd Steven Rostedt 2025-08-05 621 ret = ret ? 
-1 : size; b979d33ec48bbd Steven Rostedt 2025-08-05 622 } else { 6bc850d6f8f730 Steven Rostedt 2025-08-05 @623 ret = strncpy_from_user(buf, ptr, size); b979d33ec48bbd Steven Rostedt 2025-08-05 624 } 623bd9e046f95c Steven Rostedt 2025-08-05 625 read[i] = ret; 623bd9e046f95c Steven Rostedt 2025-08-05 626 } 6bc850d6f8f730 Steven Rostedt 2025-08-05 627 6bc850d6f8f730 Steven Rostedt 2025-08-05 628 preempt_disable_notrace(); 6bc850d6f8f730 Steven Rostedt 2025-08-05 629 migrate_enable(); 6bc850d6f8f730 Steven Rostedt 2025-08-05 630 6bc850d6f8f730 Steven Rostedt 2025-08-05 631 /* 6bc850d6f8f730 Steven Rostedt 2025-08-05 632 * Preemption is disabled again, now check the sched_switch_cnt. 6bc850d6f8f730 Steven Rostedt 2025-08-05 633 * If it increased by two or more, then another user space process 6bc850d6f8f730 Steven Rostedt 2025-08-05 634 * may have schedule in and corrupted our buffer. In that case 6bc850d6f8f730 Steven Rostedt 2025-08-05 635 * the copying must be retried. 6bc850d6f8f730 Steven Rostedt 2025-08-05 636 * 6bc850d6f8f730 Steven Rostedt 2025-08-05 637 * Note, if this task was scheduled out and only kernel threads 6bc850d6f8f730 Steven Rostedt 2025-08-05 638 * were scheduled in (maybe to process the fault), then the 6bc850d6f8f730 Steven Rostedt 2025-08-05 639 * counter would increment again when this task scheduled in. 6bc850d6f8f730 Steven Rostedt 2025-08-05 640 * If this task scheduled out and another user task scheduled 6bc850d6f8f730 Steven Rostedt 2025-08-05 641 * in, this task would still need to be scheduled back in and 6bc850d6f8f730 Steven Rostedt 2025-08-05 642 * the counter would increment by at least two. 
6bc850d6f8f730 Steven Rostedt 2025-08-05 643 */ 6bc850d6f8f730 Steven Rostedt 2025-08-05 644 if (this_cpu_read(sched_switch_cnt) > cnt + 1) 6bc850d6f8f730 Steven Rostedt 2025-08-05 645 goto again; 6bc850d6f8f730 Steven Rostedt 2025-08-05 646 623bd9e046f95c Steven Rostedt 2025-08-05 647 buf = buffer; 623bd9e046f95c Steven Rostedt 2025-08-05 648 for (i = 0; i < uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) { 623bd9e046f95c Steven Rostedt 2025-08-05 649 623bd9e046f95c Steven Rostedt 2025-08-05 650 ret = read[i]; 623bd9e046f95c Steven Rostedt 2025-08-05 651 if (ret < 0) 623bd9e046f95c Steven Rostedt 2025-08-05 652 continue; 623bd9e046f95c Steven Rostedt 2025-08-05 653 buf[ret] = '\0'; 623bd9e046f95c Steven Rostedt 2025-08-05 654 b979d33ec48bbd Steven Rostedt 2025-08-05 655 /* For strings, replace any non-printable characters with '.' */ b979d33ec48bbd Steven Rostedt 2025-08-05 656 if (!array) { 623bd9e046f95c Steven Rostedt 2025-08-05 657 for (int x = 0; x < ret; x++) { 623bd9e046f95c Steven Rostedt 2025-08-05 658 if (!isprint(buf[x])) 623bd9e046f95c Steven Rostedt 2025-08-05 659 buf[x] = '.'; 6bc850d6f8f730 Steven Rostedt 2025-08-05 660 } 6bc850d6f8f730 Steven Rostedt 2025-08-05 661 6bc850d6f8f730 Steven Rostedt 2025-08-05 662 /* 623bd9e046f95c Steven Rostedt 2025-08-05 663 * If the text was truncated due to our max limit, 623bd9e046f95c Steven Rostedt 2025-08-05 664 * add "..." to the string. 
6bc850d6f8f730 Steven Rostedt 2025-08-05 665 */ 623bd9e046f95c Steven Rostedt 2025-08-05 666 if (ret > SYSCALL_FAULT_USER_MAX) { 623bd9e046f95c Steven Rostedt 2025-08-05 667 strscpy(buf + SYSCALL_FAULT_USER_MAX, EXTRA, 623bd9e046f95c Steven Rostedt 2025-08-05 668 sizeof(EXTRA)); 623bd9e046f95c Steven Rostedt 2025-08-05 669 ret = SYSCALL_FAULT_USER_MAX + sizeof(EXTRA); 6bc850d6f8f730 Steven Rostedt 2025-08-05 670 } else { 6bc850d6f8f730 Steven Rostedt 2025-08-05 671 buf[ret++] = '\0'; 6bc850d6f8f730 Steven Rostedt 2025-08-05 672 } 623bd9e046f95c Steven Rostedt 2025-08-05 673 } else { 623bd9e046f95c Steven Rostedt 2025-08-05 674 ret = min(ret, SYSCALL_FAULT_USER_MAX); 623bd9e046f95c Steven Rostedt 2025-08-05 675 } 623bd9e046f95c Steven Rostedt 2025-08-05 676 data_size[i] = ret; b979d33ec48bbd Steven Rostedt 2025-08-05 677 } 6bc850d6f8f730 Steven Rostedt 2025-08-05 678 623bd9e046f95c Steven Rostedt 2025-08-05 679 return buffer; 6bc850d6f8f730 Steven Rostedt 2025-08-05 680 } 6bc850d6f8f730 Steven Rostedt 2025-08-05 681 -- 0-DAY CI Kernel Test Service https://github.com/intel/lkp-tests/wiki
© 2016 - 2025 Red Hat, Inc.