[v1] tracing: Show contents of syscall trace event user space fields

[PATCH 7/7] tracing: Add syscall_user_buf_size to limit amount written

Posted by Steven Rostedt 2 months ago

From: Steven Rostedt <rostedt@goodmis.org>

When a system call that reads user space addresses copy it to the ring
buffer, it can copy up to 511 bytes of data. This can waste precious ring
buffer space if the user isn't interested in the output. Add a new file
"syscall_user_buf_size" that gets initialized to a new config
CONFIG_SYSCALL_BUF_SIZE_DEFAULT that defaults to 128.

Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 Documentation/trace/ftrace.rst |  7 +++++
 kernel/trace/Kconfig           | 13 +++++++++
 kernel/trace/trace.c           | 52 ++++++++++++++++++++++++++++++++++
 kernel/trace/trace.h           |  3 ++
 kernel/trace/trace_syscalls.c  | 22 ++++++++------
 5 files changed, 88 insertions(+), 9 deletions(-)

diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst
index af66a05e18cc..4712bbfcfd08 100644
--- a/Documentation/trace/ftrace.rst
+++ b/Documentation/trace/ftrace.rst
@@ -366,6 +366,13 @@ of ftrace. Here is a list of some of the key files:
 	for each function. The displayed address is the patch-site address
 	and can differ from /proc/kallsyms address.
 
+  syscall_user_buf_size:
+
+	Some system call trace events will record the data from a user
+	space address that one of the parameters point to. The amount of
+	data per event is limited. This file holds the max number of bytes
+	that will be recorded into the ring buffer to hold this data.
+
   dyn_ftrace_total_info:
 
 	This file is for debugging purposes. The number of functions that
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index f80298e6aa16..aa28d7ca3e31 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -574,6 +574,19 @@ config FTRACE_SYSCALLS
 	help
 	  Basic tracer to catch the syscall entry and exit events.
 
+config TRACE_SYSCALL_BUF_SIZE_DEFAULT
+	int "System call user read max size"
+	range 0 128
+	default 128
+	depends on FTRACE_SYSCALLS
+	help
+	 Some system call trace events will record the data from a user
+	 space address that one of the parameters point to. The amount of
+	 data per event is limited. It may be further limited by this
+	 config and later changed by writing an ASCII number into:
+
+	   /sys/kernel/tracing/syscall_user_buf_size
+
 config TRACER_SNAPSHOT
 	bool "Create a snapshot trace buffer"
 	select TRACER_MAX_TRACE
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d0b1964648c1..1db708ed0625 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6913,6 +6913,43 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 	goto out;
 }
 
+static ssize_t
+tracing_syscall_buf_read(struct file *filp, char __user *ubuf,
+			 size_t cnt, loff_t *ppos)
+{
+	struct inode *inode = file_inode(filp);
+	struct trace_array *tr = inode->i_private;
+	char buf[64];
+	int r;
+
+	r = snprintf(buf, 64, "%d\n", tr->syscall_buf_sz);
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+tracing_syscall_buf_write(struct file *filp, const char __user *ubuf,
+			  size_t cnt, loff_t *ppos)
+{
+	struct inode *inode = file_inode(filp);
+	struct trace_array *tr = inode->i_private;
+	unsigned long val;
+	int ret;
+
+	ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+	if (ret)
+		return ret;
+
+	if (val > SYSCALL_FAULT_USER_MAX)
+		val = SYSCALL_FAULT_USER_MAX;
+
+	tr->syscall_buf_sz = val;
+
+	*ppos += cnt;
+
+	return cnt;
+}
+
 static ssize_t
 tracing_entries_read(struct file *filp, char __user *ubuf,
 		     size_t cnt, loff_t *ppos)
@@ -7737,6 +7774,14 @@ static const struct file_operations tracing_entries_fops = {
 	.release	= tracing_release_generic_tr,
 };
 
+static const struct file_operations tracing_syscall_buf_fops = {
+	.open		= tracing_open_generic_tr,
+	.read		= tracing_syscall_buf_read,
+	.write		= tracing_syscall_buf_write,
+	.llseek		= generic_file_llseek,
+	.release	= tracing_release_generic_tr,
+};
+
 static const struct file_operations tracing_buffer_meta_fops = {
 	.open		= tracing_buffer_meta_open,
 	.read		= seq_read,
@@ -9839,6 +9884,8 @@ trace_array_create_systems(const char *name, const char *systems,
 
 	raw_spin_lock_init(&tr->start_lock);
 
+	tr->syscall_buf_sz = global_trace.syscall_buf_sz;
+
 	tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
 #ifdef CONFIG_TRACER_MAX_TRACE
 	spin_lock_init(&tr->snapshot_trigger_lock);
@@ -10155,6 +10202,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
 	trace_create_file("buffer_subbuf_size_kb", TRACE_MODE_WRITE, d_tracer,
 			  tr, &buffer_subbuf_size_fops);
 
+	trace_create_file("syscall_user_buf_size", TRACE_MODE_WRITE, d_tracer,
+			 tr, &tracing_syscall_buf_fops);
+
 	create_trace_options_dir(tr);
 
 #ifdef CONFIG_TRACER_MAX_TRACE
@@ -11081,6 +11131,8 @@ __init static int tracer_alloc_buffers(void)
 
 	global_trace.flags = TRACE_ARRAY_FL_GLOBAL;
 
+	global_trace.syscall_buf_sz = CONFIG_TRACE_SYSCALL_BUF_SIZE_DEFAULT;
+
 	INIT_LIST_HEAD(&global_trace.systems);
 	INIT_LIST_HEAD(&global_trace.events);
 	INIT_LIST_HEAD(&global_trace.hist_vars);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 1dbf1d3cf2f1..1b3e464619f0 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -131,6 +131,8 @@ enum trace_type {
 #define HIST_STACKTRACE_SIZE	(HIST_STACKTRACE_DEPTH * sizeof(unsigned long))
 #define HIST_STACKTRACE_SKIP	5
 
+#define SYSCALL_FAULT_USER_MAX 128
+
 /*
  * syscalls are special, and need special handling, this is why
  * they are not included in trace_entries.h
@@ -430,6 +432,7 @@ struct trace_array {
 	int			function_enabled;
 #endif
 	int			no_filter_buffering_ref;
+	unsigned int		syscall_buf_sz;
 	struct list_head	hist_vars;
 #ifdef CONFIG_TRACER_SNAPSHOT
 	struct cond_snapshot	*cond_snapshot;
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index b39fa9dd1067..e9162165c4d2 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -407,17 +407,16 @@ struct syscall_buf_info {
  * SYSCALL_FAULT_USER_MAX is the amount to copy into the ring buffer.
  *  It's slightly smaller than SYSCALL_FAULT_ARG_SZ to know if it
  *  needs to append the EXTRA or not.
+ *  (defined in kernel/trace/trace.h)
  *
  * This only allows up to 3 args from system calls.
  */
 #define SYSCALL_FAULT_BUF_SZ 512
 #define SYSCALL_FAULT_ARG_SZ 168
-#define SYSCALL_FAULT_USER_MAX 128
 #define SYSCALL_FAULT_MAX_CNT 3
 
 static struct syscall_buf_info *syscall_buffer;
 static DEFINE_PER_CPU(unsigned long, sched_switch_cnt);
-
 static int syscall_fault_buffer_cnt;
 
 static void syscall_fault_buffer_free(struct syscall_buf_info *sinfo)
@@ -524,7 +523,7 @@ static void syscall_fault_buffer_disable(void)
 	call_rcu_tasks_trace(&sinfo->rcu, rcu_free_syscall_buffer);
 }
 
-static char *sys_fault_user(struct syscall_metadata *sys_data,
+static char *sys_fault_user(struct trace_array *tr, struct syscall_metadata *sys_data,
 			    struct syscall_buf_info *sinfo,
 			    unsigned long *args,
 			    unsigned int data_size[SYSCALL_FAULT_MAX_CNT])
@@ -576,6 +575,10 @@ static char *sys_fault_user(struct syscall_metadata *sys_data,
 		data_size[i] = -1; /* Denotes no pointer */
 	}
 
+	/* A zero size means do not even try */
+	if (!tr->syscall_buf_sz)
+		return buffer;
+
  again:
 	/*
 	 * If this task is preempted by another user space task, it
@@ -659,19 +662,20 @@ static char *sys_fault_user(struct syscall_metadata *sys_data,
 					buf[x] = '.';
 			}
 
+			size = min(tr->syscall_buf_sz, SYSCALL_FAULT_USER_MAX);
+
 			/*
 			 * If the text was truncated due to our max limit,
 			 * add "..." to the string.
 			 */
-			if (ret > SYSCALL_FAULT_USER_MAX) {
-				strscpy(buf + SYSCALL_FAULT_USER_MAX, EXTRA,
-					sizeof(EXTRA));
-				ret = SYSCALL_FAULT_USER_MAX + sizeof(EXTRA);
+			if (ret > size) {
+				strscpy(buf + size, EXTRA, sizeof(EXTRA));
+				ret = size + sizeof(EXTRA);
 			} else {
 				buf[ret++] = '\0';
 			}
 		} else {
-			ret = min(ret, SYSCALL_FAULT_USER_MAX);
+			ret = min((unsigned int)ret, tr->syscall_buf_sz);
 		}
 		data_size[i] = ret;
 	}
@@ -731,7 +735,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
 		if (!sinfo)
 			return;
 
-		user_ptr = sys_fault_user(sys_data, sinfo, args, user_sizes);
+		user_ptr = sys_fault_user(tr, sys_data, sinfo, args, user_sizes);
 		/*
 		 * user_size is the amount of data to append.
 		 * Need to add 4 for the meta field that points to
-- 
2.47.2

Re: [PATCH 7/7] tracing: Add syscall_user_buf_size to limit amount written

Posted by kernel test robot 1 month, 4 weeks ago

Hi Steven,

kernel test robot noticed the following build errors:

[auto build test ERROR on trace/for-next]
[also build test ERROR on linus/master v6.16 next-20250806]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Steven-Rostedt/tracing-Replace-syscall-RCU-pointer-assignment-with-READ-WRITE_ONCE/20250806-122312
base:   https://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace for-next
patch link:    https://lore.kernel.org/r/20250805193235.747004484%40kernel.org
patch subject: [PATCH 7/7] tracing: Add syscall_user_buf_size to limit amount written
config: hexagon-randconfig-002-20250806 (https://download.01.org/0day-ci/archive/20250806/202508062211.cwYqtLu0-lkp@intel.com/config)
compiler: clang version 22.0.0git (https://github.com/llvm/llvm-project 7b8dea265e72c3037b6b1e54d5ab51b7e14f328b)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250806/202508062211.cwYqtLu0-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202508062211.cwYqtLu0-lkp@intel.com/

All errors (new ones prefixed by >>):

>> kernel/trace/trace.c:11128:32: error: use of undeclared identifier 'CONFIG_TRACE_SYSCALL_BUF_SIZE_DEFAULT'
    11128 |         global_trace.syscall_buf_sz = CONFIG_TRACE_SYSCALL_BUF_SIZE_DEFAULT;
          |                                       ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   1 error generated.


vim +/CONFIG_TRACE_SYSCALL_BUF_SIZE_DEFAULT +11128 kernel/trace/trace.c

 11110	
 11111		init_trace_flags_index(&global_trace);
 11112	
 11113		register_tracer(&nop_trace);
 11114	
 11115		/* Function tracing may start here (via kernel command line) */
 11116		init_function_trace();
 11117	
 11118		/* All seems OK, enable tracing */
 11119		tracing_disabled = 0;
 11120	
 11121		atomic_notifier_chain_register(&panic_notifier_list,
 11122					       &trace_panic_notifier);
 11123	
 11124		register_die_notifier(&trace_die_notifier);
 11125	
 11126		global_trace.flags = TRACE_ARRAY_FL_GLOBAL;
 11127	
 11128		global_trace.syscall_buf_sz = CONFIG_TRACE_SYSCALL_BUF_SIZE_DEFAULT;
 11129	
 11130		INIT_LIST_HEAD(&global_trace.systems);
 11131		INIT_LIST_HEAD(&global_trace.events);
 11132		INIT_LIST_HEAD(&global_trace.hist_vars);
 11133		INIT_LIST_HEAD(&global_trace.err_log);
 11134		list_add(&global_trace.marker_list, &marker_copies);
 11135		list_add(&global_trace.list, &ftrace_trace_arrays);
 11136	
 11137		apply_trace_boot_options();
 11138	
 11139		register_snapshot_cmd();
 11140	
 11141		return 0;
 11142	
 11143	out_free_pipe_cpumask:
 11144		free_cpumask_var(global_trace.pipe_cpumask);
 11145	out_free_savedcmd:
 11146		trace_free_saved_cmdlines_buffer();
 11147	out_free_temp_buffer:
 11148		ring_buffer_free(temp_buffer);
 11149	out_rm_hp_state:
 11150		cpuhp_remove_multi_state(CPUHP_TRACE_RB_PREPARE);
 11151	out_free_cpumask:
 11152		free_cpumask_var(global_trace.tracing_cpumask);
 11153	out_free_buffer_mask:
 11154		free_cpumask_var(tracing_buffer_mask);
 11155		return ret;
 11156	}
 11157	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

Re: [PATCH 7/7] tracing: Add syscall_user_buf_size to limit amount written

Posted by Douglas Raillard 1 month, 4 weeks ago

On 05-08-2025 20:26, Steven Rostedt wrote:
> From: Steven Rostedt <rostedt@goodmis.org>
> 
> When a system call that reads user space addresses copy it to the ring
> buffer, it can copy up to 511 bytes of data. This can waste precious ring
> buffer space if the user isn't interested in the output. Add a new file
> "syscall_user_buf_size" that gets initialized to a new config
> CONFIG_SYSCALL_BUF_SIZE_DEFAULT that defaults to 128.

Have you considered dynamically removing some event fields ? We routinely hit
the same problem with some of our events that have rarely-used large fields.

If we could have a "fields" file in /sys/kernel/tracing/events/*/*/fields
that allowed selecting what field is needed that would be amazing. I had plans
to build something like that in our kernel module based on the synthetic events API,
but did not proceed as that API is not exported in a useful way.

> 
> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> ---
>   Documentation/trace/ftrace.rst |  7 +++++
>   kernel/trace/Kconfig           | 13 +++++++++
>   kernel/trace/trace.c           | 52 ++++++++++++++++++++++++++++++++++
>   kernel/trace/trace.h           |  3 ++
>   kernel/trace/trace_syscalls.c  | 22 ++++++++------
>   5 files changed, 88 insertions(+), 9 deletions(-)
> 
> diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst
> index af66a05e18cc..4712bbfcfd08 100644
> --- a/Documentation/trace/ftrace.rst
> +++ b/Documentation/trace/ftrace.rst
> @@ -366,6 +366,13 @@ of ftrace. Here is a list of some of the key files:
>   	for each function. The displayed address is the patch-site address
>   	and can differ from /proc/kallsyms address.
>   
> +  syscall_user_buf_size:
> +
> +	Some system call trace events will record the data from a user
> +	space address that one of the parameters point to. The amount of
> +	data per event is limited. This file holds the max number of bytes
> +	that will be recorded into the ring buffer to hold this data.
> +
>     dyn_ftrace_total_info:
>   
>   	This file is for debugging purposes. The number of functions that
> diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
> index f80298e6aa16..aa28d7ca3e31 100644
> --- a/kernel/trace/Kconfig
> +++ b/kernel/trace/Kconfig
> @@ -574,6 +574,19 @@ config FTRACE_SYSCALLS
>   	help
>   	  Basic tracer to catch the syscall entry and exit events.
>   
> +config TRACE_SYSCALL_BUF_SIZE_DEFAULT
> +	int "System call user read max size"
> +	range 0 128
> +	default 128
> +	depends on FTRACE_SYSCALLS
> +	help
> +	 Some system call trace events will record the data from a user
> +	 space address that one of the parameters point to. The amount of
> +	 data per event is limited. It may be further limited by this
> +	 config and later changed by writing an ASCII number into:
> +
> +	   /sys/kernel/tracing/syscall_user_buf_size
> +
>   config TRACER_SNAPSHOT
>   	bool "Create a snapshot trace buffer"
>   	select TRACER_MAX_TRACE
> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
> index d0b1964648c1..1db708ed0625 100644
> --- a/kernel/trace/trace.c
> +++ b/kernel/trace/trace.c
> @@ -6913,6 +6913,43 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
>   	goto out;
>   }
>   
> +static ssize_t
> +tracing_syscall_buf_read(struct file *filp, char __user *ubuf,
> +			 size_t cnt, loff_t *ppos)
> +{
> +	struct inode *inode = file_inode(filp);
> +	struct trace_array *tr = inode->i_private;
> +	char buf[64];
> +	int r;
> +
> +	r = snprintf(buf, 64, "%d\n", tr->syscall_buf_sz);
> +
> +	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
> +}
> +
> +static ssize_t
> +tracing_syscall_buf_write(struct file *filp, const char __user *ubuf,
> +			  size_t cnt, loff_t *ppos)
> +{
> +	struct inode *inode = file_inode(filp);
> +	struct trace_array *tr = inode->i_private;
> +	unsigned long val;
> +	int ret;
> +
> +	ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
> +	if (ret)
> +		return ret;
> +
> +	if (val > SYSCALL_FAULT_USER_MAX)
> +		val = SYSCALL_FAULT_USER_MAX;
> +
> +	tr->syscall_buf_sz = val;
> +
> +	*ppos += cnt;
> +
> +	return cnt;
> +}
> +
>   static ssize_t
>   tracing_entries_read(struct file *filp, char __user *ubuf,
>   		     size_t cnt, loff_t *ppos)
> @@ -7737,6 +7774,14 @@ static const struct file_operations tracing_entries_fops = {
>   	.release	= tracing_release_generic_tr,
>   };
>   
> +static const struct file_operations tracing_syscall_buf_fops = {
> +	.open		= tracing_open_generic_tr,
> +	.read		= tracing_syscall_buf_read,
> +	.write		= tracing_syscall_buf_write,
> +	.llseek		= generic_file_llseek,
> +	.release	= tracing_release_generic_tr,
> +};
> +
>   static const struct file_operations tracing_buffer_meta_fops = {
>   	.open		= tracing_buffer_meta_open,
>   	.read		= seq_read,
> @@ -9839,6 +9884,8 @@ trace_array_create_systems(const char *name, const char *systems,
>   
>   	raw_spin_lock_init(&tr->start_lock);
>   
> +	tr->syscall_buf_sz = global_trace.syscall_buf_sz;
> +
>   	tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
>   #ifdef CONFIG_TRACER_MAX_TRACE
>   	spin_lock_init(&tr->snapshot_trigger_lock);
> @@ -10155,6 +10202,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
>   	trace_create_file("buffer_subbuf_size_kb", TRACE_MODE_WRITE, d_tracer,
>   			  tr, &buffer_subbuf_size_fops);
>   
> +	trace_create_file("syscall_user_buf_size", TRACE_MODE_WRITE, d_tracer,
> +			 tr, &tracing_syscall_buf_fops);
> +
>   	create_trace_options_dir(tr);
>   
>   #ifdef CONFIG_TRACER_MAX_TRACE
> @@ -11081,6 +11131,8 @@ __init static int tracer_alloc_buffers(void)
>   
>   	global_trace.flags = TRACE_ARRAY_FL_GLOBAL;
>   
> +	global_trace.syscall_buf_sz = CONFIG_TRACE_SYSCALL_BUF_SIZE_DEFAULT;
> +
>   	INIT_LIST_HEAD(&global_trace.systems);
>   	INIT_LIST_HEAD(&global_trace.events);
>   	INIT_LIST_HEAD(&global_trace.hist_vars);
> diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
> index 1dbf1d3cf2f1..1b3e464619f0 100644
> --- a/kernel/trace/trace.h
> +++ b/kernel/trace/trace.h
> @@ -131,6 +131,8 @@ enum trace_type {
>   #define HIST_STACKTRACE_SIZE	(HIST_STACKTRACE_DEPTH * sizeof(unsigned long))
>   #define HIST_STACKTRACE_SKIP	5
>   
> +#define SYSCALL_FAULT_USER_MAX 128
> +
>   /*
>    * syscalls are special, and need special handling, this is why
>    * they are not included in trace_entries.h
> @@ -430,6 +432,7 @@ struct trace_array {
>   	int			function_enabled;
>   #endif
>   	int			no_filter_buffering_ref;
> +	unsigned int		syscall_buf_sz;
>   	struct list_head	hist_vars;
>   #ifdef CONFIG_TRACER_SNAPSHOT
>   	struct cond_snapshot	*cond_snapshot;
> diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
> index b39fa9dd1067..e9162165c4d2 100644
> --- a/kernel/trace/trace_syscalls.c
> +++ b/kernel/trace/trace_syscalls.c
> @@ -407,17 +407,16 @@ struct syscall_buf_info {
>    * SYSCALL_FAULT_USER_MAX is the amount to copy into the ring buffer.
>    *  It's slightly smaller than SYSCALL_FAULT_ARG_SZ to know if it
>    *  needs to append the EXTRA or not.
> + *  (defined in kernel/trace/trace.h)
>    *
>    * This only allows up to 3 args from system calls.
>    */
>   #define SYSCALL_FAULT_BUF_SZ 512
>   #define SYSCALL_FAULT_ARG_SZ 168
> -#define SYSCALL_FAULT_USER_MAX 128
>   #define SYSCALL_FAULT_MAX_CNT 3
>   
>   static struct syscall_buf_info *syscall_buffer;
>   static DEFINE_PER_CPU(unsigned long, sched_switch_cnt);
> -
>   static int syscall_fault_buffer_cnt;
>   
>   static void syscall_fault_buffer_free(struct syscall_buf_info *sinfo)
> @@ -524,7 +523,7 @@ static void syscall_fault_buffer_disable(void)
>   	call_rcu_tasks_trace(&sinfo->rcu, rcu_free_syscall_buffer);
>   }
>   
> -static char *sys_fault_user(struct syscall_metadata *sys_data,
> +static char *sys_fault_user(struct trace_array *tr, struct syscall_metadata *sys_data,
>   			    struct syscall_buf_info *sinfo,
>   			    unsigned long *args,
>   			    unsigned int data_size[SYSCALL_FAULT_MAX_CNT])
> @@ -576,6 +575,10 @@ static char *sys_fault_user(struct syscall_metadata *sys_data,
>   		data_size[i] = -1; /* Denotes no pointer */
>   	}
>   
> +	/* A zero size means do not even try */
> +	if (!tr->syscall_buf_sz)
> +		return buffer;
> +
>    again:
>   	/*
>   	 * If this task is preempted by another user space task, it
> @@ -659,19 +662,20 @@ static char *sys_fault_user(struct syscall_metadata *sys_data,
>   					buf[x] = '.';
>   			}
>   
> +			size = min(tr->syscall_buf_sz, SYSCALL_FAULT_USER_MAX);
> +
>   			/*
>   			 * If the text was truncated due to our max limit,
>   			 * add "..." to the string.
>   			 */
> -			if (ret > SYSCALL_FAULT_USER_MAX) {
> -				strscpy(buf + SYSCALL_FAULT_USER_MAX, EXTRA,
> -					sizeof(EXTRA));
> -				ret = SYSCALL_FAULT_USER_MAX + sizeof(EXTRA);
> +			if (ret > size) {
> +				strscpy(buf + size, EXTRA, sizeof(EXTRA));
> +				ret = size + sizeof(EXTRA);
>   			} else {
>   				buf[ret++] = '\0';
>   			}
>   		} else {
> -			ret = min(ret, SYSCALL_FAULT_USER_MAX);
> +			ret = min((unsigned int)ret, tr->syscall_buf_sz);
>   		}
>   		data_size[i] = ret;
>   	}
> @@ -731,7 +735,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
>   		if (!sinfo)
>   			return;
>   
> -		user_ptr = sys_fault_user(sys_data, sinfo, args, user_sizes);
> +		user_ptr = sys_fault_user(tr, sys_data, sinfo, args, user_sizes);
>   		/*
>   		 * user_size is the amount of data to append.
>   		 * Need to add 4 for the meta field that points to

--
Douglas

Re: [PATCH 7/7] tracing: Add syscall_user_buf_size to limit amount written

Posted by Steven Rostedt 1 month, 4 weeks ago

On Wed, 6 Aug 2025 11:50:06 +0100
Douglas Raillard <douglas.raillard@arm.com> wrote:

> On 05-08-2025 20:26, Steven Rostedt wrote:
> > From: Steven Rostedt <rostedt@goodmis.org>
> > 
> > When a system call that reads user space addresses copy it to the ring
> > buffer, it can copy up to 511 bytes of data. This can waste precious ring
> > buffer space if the user isn't interested in the output. Add a new file
> > "syscall_user_buf_size" that gets initialized to a new config
> > CONFIG_SYSCALL_BUF_SIZE_DEFAULT that defaults to 128.  
> 
> Have you considered dynamically removing some event fields ? We routinely hit
> the same problem with some of our events that have rarely-used large fields.

We do that already with eprobes. Note, syscall events are pseudo events
hooked on the raw_syscall events. Thus modifying what is displayed is
trivial as it's done manually anyway. For normal events, it's all in
the TRACE_EVENT() macro which defines the fields at boot. Trying to
modify it later is very difficult.

> 
> If we could have a "fields" file in /sys/kernel/tracing/events/*/*/fields
> that allowed selecting what field is needed that would be amazing. I had plans
> to build something like that in our kernel module based on the synthetic events API,
> but did not proceed as that API is not exported in a useful way.

Take a look at eprobes. You can make a new event based from an existing
event (including other dynamic events and syscalls).

I finally got around to adding documentation about it:

  https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/trace/eprobetrace.rst

-- Steve

Re: [PATCH 7/7] tracing: Add syscall_user_buf_size to limit amount written

Posted by Douglas Raillard 1 month, 4 weeks ago

On 06-08-2025 13:43, Steven Rostedt wrote:
> On Wed, 6 Aug 2025 11:50:06 +0100
> Douglas Raillard <douglas.raillard@arm.com> wrote:
> 
>> On 05-08-2025 20:26, Steven Rostedt wrote:
>>> From: Steven Rostedt <rostedt@goodmis.org>
>>>
>>> When a system call that reads user space addresses copy it to the ring
>>> buffer, it can copy up to 511 bytes of data. This can waste precious ring
>>> buffer space if the user isn't interested in the output. Add a new file
>>> "syscall_user_buf_size" that gets initialized to a new config
>>> CONFIG_SYSCALL_BUF_SIZE_DEFAULT that defaults to 128.
>>
>> Have you considered dynamically removing some event fields ? We routinely hit
>> the same problem with some of our events that have rarely-used large fields.
> 
> We do that already with eprobes. Note, syscall events are pseudo events
> hooked on the raw_syscall events. Thus modifying what is displayed is
> trivial as it's done manually anyway. For normal events, it's all in
> the TRACE_EVENT() macro which defines the fields at boot. Trying to
> modify it later is very difficult.

I was thinking at a filtering step between assigning to an event struct
with TP_fast_assign and actually writing it to the buffer. An array of (offset, size)
would allow selecting which field is to be copied to the buffer, the rest would
be left out (a bit like in some parts of the synthetic event API). The format
file would be impacted to remove some fields, but hopefully not too many other
corners of ftrace.

The advantage of that over eprobe would be:
1. full support of all field types
2. probably lower overhead than the fetch_op interpreter, but maybe not by much.
3. less moving pieces for the user (e.g. no need to have BTF for by-name field access,
    no new event name to come up with etc.)

> 
>>
>> If we could have a "fields" file in /sys/kernel/tracing/events/*/*/fields
>> that allowed selecting what field is needed that would be amazing. I had plans
>> to build something like that in our kernel module based on the synthetic events API,
>> but did not proceed as that API is not exported in a useful way.
> 
> Take a look at eprobes. You can make a new event based from an existing
> event (including other dynamic events and syscalls).
> I finally got around to adding documentation about it:
> 
>    https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/trace/eprobetrace.rst
> 

That's very interesting, I did not realize that you could access the actual event fields
and not just the tracepoint args. With your recent BTF patch, there is now little limits
on how deep you can drill down in the structs which is great (and actually more powerful
than the original event itself).

Before userspace tooling could make use of that as a field filtering system, a few friction
points would need to be addressed:

1. Getting the field list programmatically is currently not really possible as dealing with
    the format file is very tricky. We could just pass on the user-requested field
    to the kernel but that would prevent userspace validation with usable error reporting
    (the 6.15 kernel I tried it on gave me EINVAL and not even a dmesg error when trying to use
    a field that does not exist)

2. The type of the field is not inferred, e.g. an explicit ":string" is needed here:

      e:my/sched_switch sched.sched_switch prev_comm=$prev_comm:string

    The only place a tool can get this info from is the format file, which means you have to
    parse it and apply some conversions (e.g. "__data_loc char[]" becomes "string").

3. Only a restricted subset of field types is supported, e.g. no cpumask, buffers other
    than strings etc. In practice, this means the userspace tooling will have to either:
      * pass on the restriction to the users (can easily lead to a terrible UX by misleading
        the user to think filtering is generally available when in fact it's not).
      * or only treat that as a hint and use the unfiltered original event if the user asks
        for a field with an unsupported type.

On the bright side, creating a new event like "e:my/sched_switch" gives the event name "sched_switch" but
trace-cmd start -e my/sched_switch will only enable the new event which is exactly what we need.
This way, the trace can look like a normal one except less fields, so downstream data processing
is not impacted and only the data-gathering step needs to know about it.

Depending on whether we want/can deal with those friction point, it could either become a high-level
layer usable like the base event system with extra low-level abilities, or stay as a tool only suitable for
hand-crafted use cases where the user has deeper knowledge of layout on all involved kernels.

On a related note, if we wanted to make something that allowed reducing the amount of stored data and
that could deeply integrate with the userspace tooling in charge of collecting the data to run a user-defined query,
the best bet is to target SQL-like systems. That family is very established and virtually all trace-processing system
will use it as first stage (e.g. Perfetto with sqlite, or LISA with Polars dataframes).
In those systems, some important information can typically be extracted from the user query [1]:

1. Projection: which tables and columns the query needs. In ftrace, that's the list of events and what fields
    are needed. Other events/fields can be discarded as they won't be read by the query.

2. Row limit: how many rows the query will read (not always available obviously). In ftrace, that would allow
    automatically stopping the tracing when the event count reaches a limit, or set the buffer size based on
    the event size for a flight-recorder approach. Additional event occurrences would be discarded by the query
    anyway.

3. Predicate filtering: If the query contains a filter to only select rows with a column equal to a specific
    value. Other rows don't need to be collected as the query will discard them anyway.

Currently:
1. is partially implemented as you can select specific events, but not what field you want.
2. is partially implemented (buffer size, but AFAIK there is no way of telling ftrace to stop tracing after N events).
3. is fully implemented with /sys/kernel/debug/tracing/events/*/*/filter

If all those are implemented, ftrace would be able to make use of the most important implicit info available
in the user query to limit the collected data size, without the user having to tune anything manually
and without turning the kernel into a full-blown SQL interpreter.

[1] In the Polars dataframe library, data sources such as a parquet file served over HTTP are called "scans".
     When Polars executes an expression, it will get the data from the scans the expression refers to,
     and will pass the 3 pieces of info to the scan implementation so that processed data size can be minimized
     as early as possible in the pipeline. This is referred to as "projection pushdown", "slice pushdown" and "predicate pushdown":
     https://docs.pola.rs/user-guide/lazy/optimizations/
     If some filtering condition is too complex to express in the limited scan predicate language, filtering will happen
     later in the pipeline. If the scan does not have a smart way to apply the filter (e.g. projection pushdown for a row-oriented file format
     will probably not bring massive speed improvements) then more data than necessary will be fetched and filtering will happen
     later in the pipeline.

> -- Steve

--
Douglas

[PATCH 1/7] tracing: Replace syscall RCU pointer assignment with READ/WRITE_ONCE()
[PATCH 2/7] tracing: Have syscall trace events show "0x" for values greater than 10
[PATCH 3/7] tracing: Have syscall trace events read user space string
[PATCH 4/7] tracing: Have system call events record user array data
[PATCH 5/7] tracing: Display some syscall arrays as strings
[PATCH 6/7] tracing: Allow syscall trace events to read more than one user parameter
[PATCH 7/7] tracing: Add syscall_user_buf_size to limit amount written