[PATCH] perf trace: Skip internal syscall arguments

Namhyung Kim posted 1 patch 4 days, 16 hours ago
tools/perf/builtin-trace.c | 21 +++++++++++++++++++++
1 file changed, 21 insertions(+)
[PATCH] perf trace: Skip internal syscall arguments
Posted by Namhyung Kim 4 days, 16 hours ago
Recent changes in the linux-next kernel add a new field to the syscall
tracepoints that carries the contents of userspace memory, as shown below.

  # cat /sys/kernel/tracing/events/syscalls/sys_enter_write/format
  name: sys_enter_write
  ID: 758
  format:
          field:unsigned short common_type;       offset:0;       size:2; signed:0;
          field:unsigned char common_flags;       offset:2;       size:1; signed:0;
          field:unsigned char common_preempt_count;       offset:3;       size:1; signed:0;
          field:int common_pid;   offset:4;       size:4; signed:1;

          field:int __syscall_nr; offset:8;       size:4; signed:1;
          field:unsigned int fd;  offset:16;      size:8; signed:0;
          field:const char * buf; offset:24;      size:8; signed:0;
          field:size_t count;     offset:32;      size:8; signed:0;
          field:__data_loc char[] __buf_val;      offset:40;      size:4; signed:0;

  print fmt: "fd: 0x%08lx, buf: 0x%08lx (%s), count: 0x%08lx", ((unsigned long)(REC->fd)),
             ((unsigned long)(REC->buf)), __print_dynamic_array(__buf_val, 1),
             ((unsigned long)(REC->count))

perf trace handles those arguments in a different way, so this change
confuses it and makes some tests fail.  Fix it by skipping the new fields
that have the "__data_loc char[]" type.

Maybe we can switch to this instead of the BPF augmentation later.

Reported-by: Thomas Richter <tmricht@linux.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Howard Chu <howardchu95@gmail.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/builtin-trace.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index a743bda294bd3400..baee1f6956001d86 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -2069,6 +2069,15 @@ static const struct syscall_arg_fmt *syscall_arg_fmt__find_by_name(const char *n
        return __syscall_arg_fmt__find_by_name(syscall_arg_fmts__by_name, nmemb, name);
 }
 
+/*
+ * v6.19 kernel added new fields to read userspace memory for event tracing.
+ * But they are not used by perf and confuse the syscall parameters.
+ */
+static bool is_internal_field(struct tep_format_field *field)
+{
+	return !strcmp(field->type, "__data_loc char[]");
+}
+
 static struct tep_format_field *
 syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field *field,
 			    bool *use_btf)
@@ -2077,6 +2086,10 @@ syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field
 	int len;
 
 	for (; field; field = field->next, ++arg) {
+		/* assume it's the last argument */
+		if (is_internal_field(field))
+			continue;
+
 		last_field = field;
 
 		if (arg->scnprintf)
@@ -2145,6 +2158,7 @@ static int syscall__read_info(struct syscall *sc, struct trace *trace)
 {
 	char tp_name[128];
 	const char *name;
+	struct tep_format_field *field;
 	int err;
 
 	if (sc->nonexistent)
@@ -2201,6 +2215,13 @@ static int syscall__read_info(struct syscall *sc, struct trace *trace)
 		--sc->nr_args;
 	}
 
+	field = sc->args;
+	while (field) {
+		if (is_internal_field(field))
+			--sc->nr_args;
+		field = field->next;
+	}
+
 	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
 	sc->is_open = !strcmp(name, "open") || !strcmp(name, "openat");
 
-- 
2.52.0.487.g5c8c507ade-goog
Re: [PATCH] perf trace: Skip internal syscall arguments
Posted by Steven Rostedt 3 days, 20 hours ago
On Wed, 26 Nov 2025 20:44:18 -0800
Namhyung Kim <namhyung@kernel.org> wrote:

> Recent changes in the linux-next kernel will add new field for syscalls
> to have contents in the userspace like below.
> 
>   # cat /sys/kernel/tracing/events/syscalls/sys_enter_write/format
>   name: sys_enter_write
>   ID: 758
>   format:
>           field:unsigned short common_type;       offset:0;       size:2; signed:0;
>           field:unsigned char common_flags;       offset:2;       size:1; signed:0;
>           field:unsigned char common_preempt_count;       offset:3;       size:1; signed:0;
>           field:int common_pid;   offset:4;       size:4; signed:1;
> 
>           field:int __syscall_nr; offset:8;       size:4; signed:1;
>           field:unsigned int fd;  offset:16;      size:8; signed:0;
>           field:const char * buf; offset:24;      size:8; signed:0;
>           field:size_t count;     offset:32;      size:8; signed:0;
>           field:__data_loc char[] __buf_val;      offset:40;      size:4; signed:0;
> 
>   print fmt: "fd: 0x%08lx, buf: 0x%08lx (%s), count: 0x%08lx", ((unsigned long)(REC->fd)),
>              ((unsigned long)(REC->buf)), __print_dynamic_array(__buf_val, 1),
>              ((unsigned long)(REC->count))
> 
> We have a different way to handle those arguments and this change
> confuses perf trace then make some tests failing.  Fix it by skipping
> the new fields that have "__data_loc char[]" type.
> 
> Maybe we can switch to this instead of the BPF augmentation later.
> 

Even with this patch applied, I still have a segfault with this command:

# ./perf trace -e syscalls:sys_enter_write 
     0.000 sshd-session/5421 syscalls:sys_enter_write(perf: Segmentation fault
    #0 0x560ea815187a in dump_stack debug.c:366
    #1 0x560ea81518f0 in sighandler_dump_stack debug.c:378
    #2 0x7fb5e14d1df0 in __restore_rt libc_sigaction.c:0
    #3 0x560ea7fc0cec in syscall_arg__scnprintf_buf builtin-trace.c:1857
    #4 0x560ea7fc2692 in syscall_arg_fmt__scnprintf_val builtin-trace.c:2398
    #5 0x560ea7fc2aad in syscall__scnprintf_args builtin-trace.c:2476
    #6 0x560ea7fc3ea2 in trace__fprintf_sys_enter builtin-trace.c:2885
    #7 0x560ea7fc57a5 in trace__event_handler builtin-trace.c:3312
    #8 0x560ea7fc68dd in trace__handle_event builtin-trace.c:3649
    #9 0x560ea7fc7f7a in __trace__deliver_event builtin-trace.c:4183
    #10 0x560ea7fc80cc in trace__deliver_event builtin-trace.c:4209
    #11 0x560ea7fc92b7 in trace__run builtin-trace.c:4577
    #12 0x560ea7fcd648 in cmd_trace builtin-trace.c:5773
    #13 0x560ea7fd35e3 in run_builtin perf.c:349
    #14 0x560ea7fd387b in handle_internal_command perf.c:401
    #15 0x560ea7fd39d4 in run_argv perf.c:448
    #16 0x560ea7fd3d1d in main perf.c:555
    #17 0x7fb5e14bbca8 in __libc_start_call_main libc_start_call_main.h:74
    #18 0x7fb5e14bbd65 in __libc_start_main@@GLIBC_2.34 libc-start.c:128
    #19 0x560ea7f25f41 in _start perf[53f41]
Segmentation fault

This doesn't crash in a kernel without the __data_loc.

-- Steve
Re: [PATCH] perf trace: Skip internal syscall arguments
Posted by Namhyung Kim 2 days, 2 hours ago
On Thu, Nov 27, 2025 at 08:30:52PM -0500, Steven Rostedt wrote:
> On Wed, 26 Nov 2025 20:44:18 -0800
> Namhyung Kim <namhyung@kernel.org> wrote:
> 
> > Recent changes in the linux-next kernel will add new field for syscalls
> > to have contents in the userspace like below.
> > 
> >   # cat /sys/kernel/tracing/events/syscalls/sys_enter_write/format
> >   name: sys_enter_write
> >   ID: 758
> >   format:
> >           field:unsigned short common_type;       offset:0;       size:2; signed:0;
> >           field:unsigned char common_flags;       offset:2;       size:1; signed:0;
> >           field:unsigned char common_preempt_count;       offset:3;       size:1; signed:0;
> >           field:int common_pid;   offset:4;       size:4; signed:1;
> > 
> >           field:int __syscall_nr; offset:8;       size:4; signed:1;
> >           field:unsigned int fd;  offset:16;      size:8; signed:0;
> >           field:const char * buf; offset:24;      size:8; signed:0;
> >           field:size_t count;     offset:32;      size:8; signed:0;
> >           field:__data_loc char[] __buf_val;      offset:40;      size:4; signed:0;
> > 
> >   print fmt: "fd: 0x%08lx, buf: 0x%08lx (%s), count: 0x%08lx", ((unsigned long)(REC->fd)),
> >              ((unsigned long)(REC->buf)), __print_dynamic_array(__buf_val, 1),
> >              ((unsigned long)(REC->count))
> > 
> > We have a different way to handle those arguments and this change
> > confuses perf trace then make some tests failing.  Fix it by skipping
> > the new fields that have "__data_loc char[]" type.
> > 
> > Maybe we can switch to this instead of the BPF augmentation later.
> > 
> 
> Even with this patch applied, I still have a segfault with this command:
> 
> # ./perf trace -e syscalls:sys_enter_write 

What about this?

  # ./perf trace -e write

Thanks,
Namhyung


>      0.000 sshd-session/5421 syscalls:sys_enter_write(perf: Segmentation fault
>     #0 0x560ea815187a in dump_stack debug.c:366
>     #1 0x560ea81518f0 in sighandler_dump_stack debug.c:378
>     #2 0x7fb5e14d1df0 in __restore_rt libc_sigaction.c:0
>     #3 0x560ea7fc0cec in syscall_arg__scnprintf_buf builtin-trace.c:1857
>     #4 0x560ea7fc2692 in syscall_arg_fmt__scnprintf_val builtin-trace.c:2398
>     #5 0x560ea7fc2aad in syscall__scnprintf_args builtin-trace.c:2476
>     #6 0x560ea7fc3ea2 in trace__fprintf_sys_enter builtin-trace.c:2885
>     #7 0x560ea7fc57a5 in trace__event_handler builtin-trace.c:3312
>     #8 0x560ea7fc68dd in trace__handle_event builtin-trace.c:3649
>     #9 0x560ea7fc7f7a in __trace__deliver_event builtin-trace.c:4183
>     #10 0x560ea7fc80cc in trace__deliver_event builtin-trace.c:4209
>     #11 0x560ea7fc92b7 in trace__run builtin-trace.c:4577
>     #12 0x560ea7fcd648 in cmd_trace builtin-trace.c:5773
>     #13 0x560ea7fd35e3 in run_builtin perf.c:349
>     #14 0x560ea7fd387b in handle_internal_command perf.c:401
>     #15 0x560ea7fd39d4 in run_argv perf.c:448
>     #16 0x560ea7fd3d1d in main perf.c:555
>     #17 0x7fb5e14bbca8 in __libc_start_call_main libc_start_call_main.h:74
>     #18 0x7fb5e14bbd65 in __libc_start_main@@GLIBC_2.34 libc-start.c:128
>     #19 0x560ea7f25f41 in _start perf[53f41]
> Segmentation fault
> 
> This doesn't crash in a kernel without the __data_loc.
> 
> -- Steve
Re: [PATCH] perf trace: Skip internal syscall arguments
Posted by Steven Rostedt 2 days, 1 hour ago
On Sat, 29 Nov 2025 11:07:20 -0800
Namhyung Kim <namhyung@kernel.org> wrote:

> > Even with this patch applied, I still have a segfault with this command:
> > 
> > # ./perf trace -e syscalls:sys_enter_write   
> 
> What about this?
> 
>   # ./perf trace -e write

Yes, that works. As long as it isn't considered a regression that
sys_enter_write crashes, I'm fine with it:

 Tested-by: Steven Rostedt (Google) <rostedt@goodmis.org>

-- Steve
Re: [PATCH] perf trace: Skip internal syscall arguments
Posted by Namhyung Kim 2 days, 1 hour ago
On Sat, Nov 29, 2025 at 02:42:02PM -0500, Steven Rostedt wrote:
> On Sat, 29 Nov 2025 11:07:20 -0800
> Namhyung Kim <namhyung@kernel.org> wrote:
> 
> > > Even with this patch applied, I still have a segfault with this command:
> > > 
> > > # ./perf trace -e syscalls:sys_enter_write   
> > 
> > What about this?
> > 
> >   # ./perf trace -e write
> 
> Yes that works. As long as it isn't considered a regression that the
> sys_enter_write crashes, then I'm fine with;
> 
>  Tested-by: Steven Rostedt (Google) <rostedt@goodmis.org>

Thanks, I think it's a separate issue.  Will take a look.

Namhyung
Re: [PATCH] perf trace: Skip internal syscall arguments
Posted by Thomas Richter 4 days, 14 hours ago
On 11/27/25 05:44, Namhyung Kim wrote:
> Recent changes in the linux-next kernel will add new field for syscalls
> to have contents in the userspace like below.
> 
>   # cat /sys/kernel/tracing/events/syscalls/sys_enter_write/format
>   name: sys_enter_write
>   ID: 758
>   format:
>           field:unsigned short common_type;       offset:0;       size:2; signed:0;
>           field:unsigned char common_flags;       offset:2;       size:1; signed:0;
>           field:unsigned char common_preempt_count;       offset:3;       size:1; signed:0;
>           field:int common_pid;   offset:4;       size:4; signed:1;
> 
>           field:int __syscall_nr; offset:8;       size:4; signed:1;
>           field:unsigned int fd;  offset:16;      size:8; signed:0;
>           field:const char * buf; offset:24;      size:8; signed:0;
>           field:size_t count;     offset:32;      size:8; signed:0;
>           field:__data_loc char[] __buf_val;      offset:40;      size:4; signed:0;
> 
>   print fmt: "fd: 0x%08lx, buf: 0x%08lx (%s), count: 0x%08lx", ((unsigned long)(REC->fd)),
>              ((unsigned long)(REC->buf)), __print_dynamic_array(__buf_val, 1),
>              ((unsigned long)(REC->count))
> 
> We have a different way to handle those arguments and this change
> confuses perf trace then make some tests failing.  Fix it by skipping
> the new fields that have "__data_loc char[]" type.
> 
> Maybe we can switch to this instead of the BPF augmentation later.
> 
> Reported-by: Thomas Richter <tmricht@linux.ibm.com>
> Cc: Steven Rostedt <rostedt@goodmis.org>
> Cc: Howard Chu <howardchu95@gmail.com>
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> ---
>  tools/perf/builtin-trace.c | 21 +++++++++++++++++++++
>  1 file changed, 21 insertions(+)
> 
> diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
> index a743bda294bd3400..baee1f6956001d86 100644
> --- a/tools/perf/builtin-trace.c
> +++ b/tools/perf/builtin-trace.c
> @@ -2069,6 +2069,15 @@ static const struct syscall_arg_fmt *syscall_arg_fmt__find_by_name(const char *n
>         return __syscall_arg_fmt__find_by_name(syscall_arg_fmts__by_name, nmemb, name);
>  }
>  
> +/*
> + * v6.19 kernel added new fields to read userspace memory for event tracing.
> + * But it's not used by perf and confuses the syscall parameters.
> + */
> +static bool is_internal_field(struct tep_format_field *field)
> +{
> +	return !strcmp(field->type, "__data_loc char[]");
> +}
> +
>  static struct tep_format_field *
>  syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field *field,
>  			    bool *use_btf)
> @@ -2077,6 +2086,10 @@ syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field
>  	int len;
>  
>  	for (; field; field = field->next, ++arg) {
> +		/* assume it's the last argument */
> +		if (is_internal_field(field))
> +			continue;
> +
>  		last_field = field;
>  
>  		if (arg->scnprintf)
> @@ -2145,6 +2158,7 @@ static int syscall__read_info(struct syscall *sc, struct trace *trace)
>  {
>  	char tp_name[128];
>  	const char *name;
> +	struct tep_format_field *field;
>  	int err;
>  
>  	if (sc->nonexistent)
> @@ -2201,6 +2215,13 @@ static int syscall__read_info(struct syscall *sc, struct trace *trace)
>  		--sc->nr_args;
>  	}
>  
> +	field = sc->args;
> +	while (field) {
> +		if (is_internal_field(field))
> +			--sc->nr_args;
> +		field = field->next;
> +	}
> +
>  	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
>  	sc->is_open = !strcmp(name, "open") || !strcmp(name, "openat");
>  

With the patch it succeeds again:
❯ ./perf test -F 'perf trace BTF general tests'
Checking if vmlinux BTF exists
Testing perf trace's string augmentation
Testing perf trace's buffer augmentation
Testing perf trace's struct augmentation
110: perf trace BTF general tests                                    : Ok
>

Tested-by: Thomas Richter <tmricht@linux.ibm.com>
-- 
Thomas Richter, Dept 3303, IBM s390 Linux Development, Boeblingen, Germany
--
IBM Deutschland Research & Development GmbH

Vorsitzender des Aufsichtsrats: Wolfgang Wendt

Geschäftsführung: David Faller

Sitz der Gesellschaft: Böblingen / Registergericht: Amtsgericht Stuttgart, HRB 243294
Re: [PATCH] perf trace: Skip internal syscall arguments
Posted by Howard Chu 4 days, 1 hour ago
Hi Namhyung,

On Wed, Nov 26, 2025 at 11:10 PM Thomas Richter <tmricht@linux.ibm.com> wrote:
>
> On 11/27/25 05:44, Namhyung Kim wrote:
> > Recent changes in the linux-next kernel will add new field for syscalls
> > to have contents in the userspace like below.
> >
> >   # cat /sys/kernel/tracing/events/syscalls/sys_enter_write/format
> >   name: sys_enter_write
> >   ID: 758
> >   format:
> >           field:unsigned short common_type;       offset:0;       size:2; signed:0;
> >           field:unsigned char common_flags;       offset:2;       size:1; signed:0;
> >           field:unsigned char common_preempt_count;       offset:3;       size:1; signed:0;
> >           field:int common_pid;   offset:4;       size:4; signed:1;
> >
> >           field:int __syscall_nr; offset:8;       size:4; signed:1;
> >           field:unsigned int fd;  offset:16;      size:8; signed:0;
> >           field:const char * buf; offset:24;      size:8; signed:0;
> >           field:size_t count;     offset:32;      size:8; signed:0;
> >           field:__data_loc char[] __buf_val;      offset:40;      size:4; signed:0;
> >
> >   print fmt: "fd: 0x%08lx, buf: 0x%08lx (%s), count: 0x%08lx", ((unsigned long)(REC->fd)),
> >              ((unsigned long)(REC->buf)), __print_dynamic_array(__buf_val, 1),
> >              ((unsigned long)(REC->count))
> >
> > We have a different way to handle those arguments and this change
> > confuses perf trace then make some tests failing.  Fix it by skipping
> > the new fields that have "__data_loc char[]" type.
> >
> > Maybe we can switch to this instead of the BPF augmentation later.
> >
> > Reported-by: Thomas Richter <tmricht@linux.ibm.com>
> > Cc: Steven Rostedt <rostedt@goodmis.org>
> > Cc: Howard Chu <howardchu95@gmail.com>
> > Signed-off-by: Namhyung Kim <namhyung@kernel.org>

LGTM.

Reviewed-by: Howard Chu <howardchu95@gmail.com>

Thanks,
Howard

> > ---
> >  tools/perf/builtin-trace.c | 21 +++++++++++++++++++++
> >  1 file changed, 21 insertions(+)
> >
> > diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
> > index a743bda294bd3400..baee1f6956001d86 100644
> > --- a/tools/perf/builtin-trace.c
> > +++ b/tools/perf/builtin-trace.c
> > @@ -2069,6 +2069,15 @@ static const struct syscall_arg_fmt *syscall_arg_fmt__find_by_name(const char *n
> >         return __syscall_arg_fmt__find_by_name(syscall_arg_fmts__by_name, nmemb, name);
> >  }
> >
> > +/*
> > + * v6.19 kernel added new fields to read userspace memory for event tracing.
> > + * But it's not used by perf and confuses the syscall parameters.
> > + */
> > +static bool is_internal_field(struct tep_format_field *field)
> > +{
> > +     return !strcmp(field->type, "__data_loc char[]");
> > +}
> > +
> >  static struct tep_format_field *
> >  syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field *field,
> >                           bool *use_btf)
> > @@ -2077,6 +2086,10 @@ syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field
> >       int len;
> >
> >       for (; field; field = field->next, ++arg) {
> > +             /* assume it's the last argument */
> > +             if (is_internal_field(field))
> > +                     continue;
> > +
> >               last_field = field;
> >
> >               if (arg->scnprintf)
> > @@ -2145,6 +2158,7 @@ static int syscall__read_info(struct syscall *sc, struct trace *trace)
> >  {
> >       char tp_name[128];
> >       const char *name;
> > +     struct tep_format_field *field;
> >       int err;
> >
> >       if (sc->nonexistent)
> > @@ -2201,6 +2215,13 @@ static int syscall__read_info(struct syscall *sc, struct trace *trace)
> >               --sc->nr_args;
> >       }
> >
> > +     field = sc->args;
> > +     while (field) {
> > +             if (is_internal_field(field))
> > +                     --sc->nr_args;
> > +             field = field->next;
> > +     }
> > +
> >       sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
> >       sc->is_open = !strcmp(name, "open") || !strcmp(name, "openat");
> >
>
> With the patch it succeeds again:
> ❯ ./perf test -F 'perf trace BTF general tests'
> Checking if vmlinux BTF exists
> Testing perf trace's string augmentation
> Testing perf trace's buffer augmentation
> Testing perf trace's struct augmentation
> 110: perf trace BTF general tests                                    : Ok
> >
>
> Tested-by: Thomas Richter <tmricht@linux.ibm.com>
> --
> Thomas Richter, Dept 3303, IBM s390 Linux Development, Boeblingen, Germany
> --
> IBM Deutschland Research & Development GmbH
>
> Vorsitzender des Aufsichtsrats: Wolfgang Wendt
>
> Geschäftsführung: David Faller
>
> Sitz der Gesellschaft: Böblingen / Registergericht: Amtsgericht Stuttgart, HRB 243294