[PATCH v3 6/8] perf trace: Collect augmented data using BPF

Howard Chu posted 8 patches 1 year, 3 months ago
[PATCH v3 6/8] perf trace: Collect augmented data using BPF
Posted by Howard Chu 1 year, 3 months ago
Include trace_augment.h for TRACE_AUG_MAX_BUF, so that BPF reads
TRACE_AUG_MAX_BUF bytes of buffer maximum.

Determine what type of argument and how many bytes to read from user space, using the
value in the beauty_map. This is the relation of parameter type and its corresponding
value in the beauty map, and how many bytes we read eventually:

string: 1                          -> size of string (till null)
struct: size of struct             -> size of struct
buffer: -1 * (index of paired len) -> value of paired len (maximum: TRACE_AUG_MAX_BUF)

After reading from user space, we output the augmented data using
bpf_perf_event_output().

If the struct augmenter, augment_sys_enter() failed, we fall back to
using bpf_tail_call().

I have to make the payload 6 times the size of augmented_arg, to pass the
BPF verifier.

Committer notes:

It works, but we need to wire it up to the userspace specialized pretty
printer, otherwise we get things like:

  root@number:~# perf trace -e connect ssh localhost
       0.000 ( 0.010 ms): :784442/784442 connect(fd: 3, uservaddr: {2,}, addrlen: 16)                          = 0
       0.016 ( 0.006 ms): :784442/784442 connect(fd: 3, uservaddr: {10,}, addrlen: 28)                         = 0
       0.033 ( 0.096 ms): :784442/784442 connect(fd: 3, uservaddr: {10,}, addrlen: 28)                         = 0
  root@localhost's password:     71.292 ( 0.037 ms): ssh/784442 connect(fd: 4, uservaddr: {1,{['/','v','a','r','/','r','u','n','/','.','h','e','i','m',],},}, addrlen: 110) = 0
      72.087 ( 0.013 ms): ssh/784442 connect(fd: 4, uservaddr: {1,{['/','v','a','r','/','r','u','n','/','.','h','e','i','m',],},}, addrlen: 110) = 0

  root@number:~#

When we used to have:

  root@number:~# perf trace -e connect ssh localhost
       0.000 ( 0.011 ms): ssh/786564 connect(fd: 3, uservaddr: { .family: INET, port: 22, addr: 127.0.0.1 }, addrlen: 16) = 0
       0.017 ( 0.006 ms): ssh/786564 connect(fd: 3, uservaddr: { .family: INET6, port: 22, addr: ::1 }, addrlen: 28) = 0
       0.263 ( 0.043 ms): ssh/786564 connect(fd: 3, uservaddr: { .family: INET6, port: 22, addr: ::1 }, addrlen: 28) = 0
      63.770 ( 0.044 ms): ssh/786564 connect(fd: 4, uservaddr: { .family: LOCAL, path: /var/run/.heim_org.h5l.kcm-socket }, addrlen: 110) = 0
      65.467 ( 0.042 ms): ssh/786564 connect(fd: 4, uservaddr: { .family: LOCAL, path: /var/run/.heim_org.h5l.kcm-socket }, addrlen: 110) = 0
  root@localhost's password:

That is closer to what strace produces:

  root@number:~# strace -e connect ssh localhost
  connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("127.0.0.1")}, 16) = 0
  connect(3, {sa_family=AF_INET6, sin6_port=htons(22), sin6_flowinfo=htonl(0), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_scope_id=0}, 28) = 0
  connect(3, {sa_family=AF_INET6, sin6_port=htons(22), sin6_flowinfo=htonl(0), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_scope_id=0}, 28) = 0
  connect(4, {sa_family=AF_UNIX, sun_path="/var/run/.heim_org.h5l.kcm-socket"}, 110) = 0
  connect(4, {sa_family=AF_UNIX, sun_path="/var/run/.heim_org.h5l.kcm-socket"}, 110) = 0
  root@localhost's password:

Signed-off-by: Howard Chu <howardchu95@gmail.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/r/20240815013626.935097-10-howardchu95@gmail.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 .../bpf_skel/augmented_raw_syscalls.bpf.c     | 114 +++++++++++++++++-
 1 file changed, 113 insertions(+), 1 deletion(-)

diff --git a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
index 0acbd74e8c76..f29a8dfca044 100644
--- a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
+++ b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
@@ -7,6 +7,8 @@
  */
 
 #include "vmlinux.h"
+#include "../trace_augment.h"
+
 #include <bpf/bpf_helpers.h>
 #include <linux/limits.h>
 
@@ -124,6 +126,25 @@ struct augmented_args_tmp {
 	__uint(max_entries, 1);
 } augmented_args_tmp SEC(".maps");
 
+struct beauty_payload_enter {
+	struct syscall_enter_args args;
+	struct augmented_arg aug_args[6];
+};
+
+struct beauty_map_enter {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__type(key, int);
+	__type(value, __u32[6]);
+	__uint(max_entries, 512);
+} beauty_map_enter SEC(".maps");
+
+struct beauty_payload_enter_map {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__type(key, int);
+	__type(value, struct beauty_payload_enter);
+	__uint(max_entries, 1);
+} beauty_payload_enter_map SEC(".maps");
+
 static inline struct augmented_args_payload *augmented_args_payload(void)
 {
 	int key = 0;
@@ -136,6 +157,11 @@ static inline int augmented__output(void *ctx, struct augmented_args_payload *ar
 	return bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, args, len);
 }
 
+static inline int augmented__beauty_output(void *ctx, void *data, int len)
+{
+	return bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, data, len);
+}
+
 static inline
 unsigned int augmented_arg__read_str(struct augmented_arg *augmented_arg, const void *arg, unsigned int arg_len)
 {
@@ -372,6 +398,91 @@ static bool pid_filter__has(struct pids_filtered *pids, pid_t pid)
 	return bpf_map_lookup_elem(pids, &pid) != NULL;
 }
 
+static int augment_sys_enter(void *ctx, struct syscall_enter_args *args)
+{
+	bool augmented, do_output = false;
+	int zero = 0, size, aug_size, index, output = 0,
+	    value_size = sizeof(struct augmented_arg) - offsetof(struct augmented_arg, value);
+	unsigned int nr, *beauty_map;
+	struct beauty_payload_enter *payload;
+	void *arg, *payload_offset;
+
+	/* fall back to do predefined tail call */
+	if (args == NULL)
+		return 1;
+
+	/* use syscall number to get beauty_map entry */
+	nr             = (__u32)args->syscall_nr;
+	beauty_map     = bpf_map_lookup_elem(&beauty_map_enter, &nr);
+
+	/* set up payload for output */
+	payload        = bpf_map_lookup_elem(&beauty_payload_enter_map, &zero);
+	payload_offset = (void *)&payload->aug_args;
+
+	if (beauty_map == NULL || payload == NULL)
+		return 1;
+
+	/* copy the sys_enter header, which has the syscall_nr */
+	__builtin_memcpy(&payload->args, args, sizeof(struct syscall_enter_args));
+
+	/*
+	 * Determine what type of argument and how many bytes to read from user space, using the
+	 * value in the beauty_map. This is the relation of parameter type and its corresponding
+	 * value in the beauty map, and how many bytes we read eventually:
+	 *
+	 * string: 1			      -> size of string
+	 * struct: size of struct	      -> size of struct
+	 * buffer: -1 * (index of paired len) -> value of paired len (maximum: TRACE_AUG_MAX_BUF)
+	 */
+	for (int i = 0; i < 6; i++) {
+		arg = (void *)args->args[i];
+		augmented = false;
+		size = beauty_map[i];
+		aug_size = size; /* size of the augmented data read from user space */
+
+		if (size == 0 || arg == NULL)
+			continue;
+
+		if (size == 1) { /* string */
+			aug_size = bpf_probe_read_user_str(((struct augmented_arg *)payload_offset)->value, value_size, arg);
+			/* minimum of 0 to pass the verifier */
+			if (aug_size < 0)
+				aug_size = 0;
+
+			augmented = true;
+		} else if (size > 0 && size <= value_size) { /* struct */
+			if (!bpf_probe_read_user(((struct augmented_arg *)payload_offset)->value, size, arg))
+				augmented = true;
+		} else if (size < 0 && size >= -6) { /* buffer */
+			index = -(size + 1);
+			aug_size = args->args[index];
+
+			if (aug_size > TRACE_AUG_MAX_BUF)
+				aug_size = TRACE_AUG_MAX_BUF;
+
+			if (aug_size > 0) {
+				if (!bpf_probe_read_user(((struct augmented_arg *)payload_offset)->value, aug_size, arg))
+					augmented = true;
+			}
+		}
+
+		/* write data to payload */
+		if (augmented) {
+			int written = offsetof(struct augmented_arg, value) + aug_size;
+
+			((struct augmented_arg *)payload_offset)->size = aug_size;
+			output += written;
+			payload_offset += written;
+			do_output = true;
+		}
+	}
+
+	if (!do_output)
+		return 1;
+
+	return augmented__beauty_output(ctx, payload, sizeof(struct syscall_enter_args) + output);
+}
+
 SEC("tp/raw_syscalls/sys_enter")
 int sys_enter(struct syscall_enter_args *args)
 {
@@ -400,7 +511,8 @@ int sys_enter(struct syscall_enter_args *args)
 	 * "!raw_syscalls:unaugmented" that will just return 1 to return the
 	 * unaugmented tracepoint payload.
 	 */
-	bpf_tail_call(args, &syscalls_sys_enter, augmented_args->args.syscall_nr);
+	if (augment_sys_enter(args, &augmented_args->args))
+		bpf_tail_call(args, &syscalls_sys_enter, augmented_args->args.syscall_nr);
 
 	// If not found on the PROG_ARRAY syscalls map, then we're filtering it:
 	return 0;
-- 
2.45.2
Re: [PATCH v3 6/8] perf trace: Collect augmented data using BPF
Posted by Arnaldo Carvalho de Melo 1 year, 3 months ago
On Sun, Aug 25, 2024 at 12:33:20AM +0800, Howard Chu wrote:
> Include trace_augment.h for TRACE_AUG_MAX_BUF, so that BPF reads
> TRACE_AUG_MAX_BUF bytes of buffer maximum.
> 
> Determine what type of argument and how many bytes to read from user space, using the
> value in the beauty_map. This is the relation of parameter type and its corresponding
> value in the beauty map, and how many bytes we read eventually:
> 
> string: 1                          -> size of string (till null)
> struct: size of struct             -> size of struct
> buffer: -1 * (index of paired len) -> value of paired len (maximum: TRACE_AUG_MAX_BUF)
> 
> After reading from user space, we output the augmented data using
> bpf_perf_event_output().
> 
> If the struct augmenter, augment_sys_enter() failed, we fall back to
> using bpf_tail_call().
> 
> I have to make the payload 6 times the size of augmented_arg, to pass the
> BPF verifier.
> 
> Committer notes:
> 
> It works, but we need to wire it up to the userspace specialized pretty
> printer, otherwise we get things like:
> 
>   root@number:~# perf trace -e connect ssh localhost
>        0.000 ( 0.010 ms): :784442/784442 connect(fd: 3, uservaddr: {2,}, addrlen: 16)                          = 0
>        0.016 ( 0.006 ms): :784442/784442 connect(fd: 3, uservaddr: {10,}, addrlen: 28)                         = 0
>        0.033 ( 0.096 ms): :784442/784442 connect(fd: 3, uservaddr: {10,}, addrlen: 28)                         = 0
>   root@localhost's password:     71.292 ( 0.037 ms): ssh/784442 connect(fd: 4, uservaddr: {1,{['/','v','a','r','/','r','u','n','/','.','h','e','i','m',],},}, addrlen: 110) = 0
>       72.087 ( 0.013 ms): ssh/784442 connect(fd: 4, uservaddr: {1,{['/','v','a','r','/','r','u','n','/','.','h','e','i','m',],},}, addrlen: 110) = 0
> 
>   root@number:~#
> 
> When we used to have:
> 
>   root@number:~# perf trace -e connect ssh localhost
>        0.000 ( 0.011 ms): ssh/786564 connect(fd: 3, uservaddr: { .family: INET, port: 22, addr: 127.0.0.1 }, addrlen: 16) = 0
>        0.017 ( 0.006 ms): ssh/786564 connect(fd: 3, uservaddr: { .family: INET6, port: 22, addr: ::1 }, addrlen: 28) = 0
>        0.263 ( 0.043 ms): ssh/786564 connect(fd: 3, uservaddr: { .family: INET6, port: 22, addr: ::1 }, addrlen: 28) = 0
>       63.770 ( 0.044 ms): ssh/786564 connect(fd: 4, uservaddr: { .family: LOCAL, path: /var/run/.heim_org.h5l.kcm-socket }, addrlen: 110) = 0
>       65.467 ( 0.042 ms): ssh/786564 connect(fd: 4, uservaddr: { .family: LOCAL, path: /var/run/.heim_org.h5l.kcm-socket }, addrlen: 110) = 0
>   root@localhost's password:
> 
> That is closer to what strace produces:
> 
>   root@number:~# strace -e connect ssh localhost
>   connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("127.0.0.1")}, 16) = 0
>   connect(3, {sa_family=AF_INET6, sin6_port=htons(22), sin6_flowinfo=htonl(0), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_scope_id=0}, 28) = 0
>   connect(3, {sa_family=AF_INET6, sin6_port=htons(22), sin6_flowinfo=htonl(0), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_scope_id=0}, 28) = 0
>   connect(4, {sa_family=AF_UNIX, sun_path="/var/run/.heim_org.h5l.kcm-socket"}, 110) = 0
>   connect(4, {sa_family=AF_UNIX, sun_path="/var/run/.heim_org.h5l.kcm-socket"}, 110) = 0
>   root@localhost's password:
> 
> Signed-off-by: Howard Chu <howardchu95@gmail.com>
> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
> Cc: Adrian Hunter <adrian.hunter@intel.com>
> Cc: Ian Rogers <irogers@google.com>
> Cc: Jiri Olsa <jolsa@kernel.org>
> Cc: Kan Liang <kan.liang@linux.intel.com>
> Cc: Namhyung Kim <namhyung@kernel.org>
> Link: https://lore.kernel.org/r/20240815013626.935097-10-howardchu95@gmail.com
> Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
> ---
>  .../bpf_skel/augmented_raw_syscalls.bpf.c     | 114 +++++++++++++++++-
>  1 file changed, 113 insertions(+), 1 deletion(-)
> 
> diff --git a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
> index 0acbd74e8c76..f29a8dfca044 100644
> --- a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
> +++ b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
> @@ -7,6 +7,8 @@
>   */
>  
>  #include "vmlinux.h"
> +#include "../trace_augment.h"
> +
>  #include <bpf/bpf_helpers.h>
>  #include <linux/limits.h>
>  
> @@ -124,6 +126,25 @@ struct augmented_args_tmp {
>  	__uint(max_entries, 1);
>  } augmented_args_tmp SEC(".maps");
>  
> +struct beauty_payload_enter {
> +	struct syscall_enter_args args;
> +	struct augmented_arg aug_args[6];
> +};

⬢[acme@toolbox perf-tools-next]$ pahole -C augmented_arg /tmp/build/perf-tools-next/util/bpf_skel/.tmp/augmented_raw_syscalls.bpf.o 
struct augmented_arg {
	unsigned int               size;                 /*     0     4 */
	int                        err;                  /*     4     4 */
	char                       value[4096];          /*     8  4096 */

	/* size: 4104, cachelines: 65, members: 3 */
	/* last cacheline: 8 bytes */
};

⬢[acme@toolbox perf-tools-next]$

⬢[acme@toolbox perf-tools-next]$ pahole -C syscall_enter_args /tmp/build/perf-tools-next/util/bpf_skel/.tmp/augmented_raw_syscalls.bpf.o 
struct syscall_enter_args {
	unsigned long long         common_tp_fields;     /*     0     8 */
	long                       syscall_nr;           /*     8     8 */
	unsigned long              args[6];              /*    16    48 */

	/* size: 64, cachelines: 1, members: 3 */
};

So the entry has the theoretical max limit for the augmented payload
which would be 6 * sizeof(struct augmented_arg) + the common header for
tracepoints (unaugmented), a lot of space, but...

> +static int augment_sys_enter(void *ctx, struct syscall_enter_args *args)
> +{
> +	bool augmented, do_output = false;
> +	int zero = 0, size, aug_size, index, output = 0,
> +	    value_size = sizeof(struct augmented_arg) - offsetof(struct augmented_arg, value);
> +	unsigned int nr, *beauty_map;
> +	struct beauty_payload_enter *payload;
> +	void *arg, *payload_offset;
> +
> +	/* fall back to do predefined tail call */
> +	if (args == NULL)
> +		return 1;
> +
> +	/* use syscall number to get beauty_map entry */
> +	nr             = (__u32)args->syscall_nr;
> +	beauty_map     = bpf_map_lookup_elem(&beauty_map_enter, &nr);

If we don't set up the entry for this syscall, then there are no
pointers to collect, we return early and the tracepoint will not filter
it and there it goes, up unmodified, if not already filtered by a
tracepoint filter, ok, I'll add these comments here.

> +	/* set up payload for output */
> +	payload        = bpf_map_lookup_elem(&beauty_payload_enter_map, &zero);
> +	payload_offset = (void *)&payload->aug_args;
> +
> +	if (beauty_map == NULL || payload == NULL)
> +		return 1;

We can save the lookup for payload if the one for beauty_map returned
NULL, I'll do this fixup.

> +	/* copy the sys_enter header, which has the syscall_nr */
> +	__builtin_memcpy(&payload->args, args, sizeof(struct syscall_enter_args));


> +	/*
> +	 * Determine what type of argument and how many bytes to read from user space, using the
> +	 * value in the beauty_map. This is the relation of parameter type and its corresponding
> +	 * value in the beauty map, and how many bytes we read eventually:
> +	 *
> +	 * string: 1			      -> size of string
> +	 * struct: size of struct	      -> size of struct
> +	 * buffer: -1 * (index of paired len) -> value of paired len (maximum: TRACE_AUG_MAX_BUF)

'paired_len'? Ok, 1, size of string or struct


Will continue the detailed analysis later, as I need to change the
existing BPF collector, before applying this, to match the protocol here
(that will always have the size of each argument encoded in the ring
buffer, easier, but uses more space).

- Arnaldo

> +	 */
> +	for (int i = 0; i < 6; i++) {
> +		arg = (void *)args->args[i];
> +		augmented = false;
> +		size = beauty_map[i];
> +		aug_size = size; /* size of the augmented data read from user space */
> +
> +		if (size == 0 || arg == NULL)
> +			continue;
> +
> +		if (size == 1) { /* string */
> +			aug_size = bpf_probe_read_user_str(((struct augmented_arg *)payload_offset)->value, value_size, arg);
> +			/* minimum of 0 to pass the verifier */
> +			if (aug_size < 0)
> +				aug_size = 0;
> +
> +			augmented = true;
> +		} else if (size > 0 && size <= value_size) { /* struct */
> +			if (!bpf_probe_read_user(((struct augmented_arg *)payload_offset)->value, size, arg))
> +				augmented = true;
> +		} else if (size < 0 && size >= -6) { /* buffer */
> +			index = -(size + 1);
> +			aug_size = args->args[index];
> +
> +			if (aug_size > TRACE_AUG_MAX_BUF)
> +				aug_size = TRACE_AUG_MAX_BUF;
> +
> +			if (aug_size > 0) {
> +				if (!bpf_probe_read_user(((struct augmented_arg *)payload_offset)->value, aug_size, arg))
> +					augmented = true;
> +			}
> +		}
> +
> +		/* write data to payload */
> +		if (augmented) {
> +			int written = offsetof(struct augmented_arg, value) + aug_size;
> +
> +			((struct augmented_arg *)payload_offset)->size = aug_size;
> +			output += written;
> +			payload_offset += written;
> +			do_output = true;
> +		}
> +	}
> +
> +	if (!do_output)
> +		return 1;
> +
> +	return augmented__beauty_output(ctx, payload, sizeof(struct syscall_enter_args) + output);

We end up pushing up to the ring buffer all the way to 
Re: [PATCH v3 6/8] perf trace: Collect augmented data using BPF
Posted by Howard Chu 1 year, 3 months ago
Hello Arnaldo,

On Wed, Sep 4, 2024 at 12:53 PM Arnaldo Carvalho de Melo
<acme@kernel.org> wrote:
>
> On Sun, Aug 25, 2024 at 12:33:20AM +0800, Howard Chu wrote:
> > Include trace_augment.h for TRACE_AUG_MAX_BUF, so that BPF reads
> > TRACE_AUG_MAX_BUF bytes of buffer maximum.
> >
> > Determine what type of argument and how many bytes to read from user space, using the
> > value in the beauty_map. This is the relation of parameter type and its corresponding
> > value in the beauty map, and how many bytes we read eventually:
> >
> > string: 1                          -> size of string (till null)
> > struct: size of struct             -> size of struct
> > buffer: -1 * (index of paired len) -> value of paired len (maximum: TRACE_AUG_MAX_BUF)
> >
> > After reading from user space, we output the augmented data using
> > bpf_perf_event_output().
> >
> > If the struct augmenter, augment_sys_enter() failed, we fall back to
> > using bpf_tail_call().
> >
> > I have to make the payload 6 times the size of augmented_arg, to pass the
> > BPF verifier.
> >
> > Committer notes:
> >
> > It works, but we need to wire it up to the userspace specialized pretty
> > printer, otherwise we get things like:
> >
> >   root@number:~# perf trace -e connect ssh localhost
> >        0.000 ( 0.010 ms): :784442/784442 connect(fd: 3, uservaddr: {2,}, addrlen: 16)                          = 0
> >        0.016 ( 0.006 ms): :784442/784442 connect(fd: 3, uservaddr: {10,}, addrlen: 28)                         = 0
> >        0.033 ( 0.096 ms): :784442/784442 connect(fd: 3, uservaddr: {10,}, addrlen: 28)                         = 0
> >   root@localhost's password:     71.292 ( 0.037 ms): ssh/784442 connect(fd: 4, uservaddr: {1,{['/','v','a','r','/','r','u','n','/','.','h','e','i','m',],},}, addrlen: 110) = 0
> >       72.087 ( 0.013 ms): ssh/784442 connect(fd: 4, uservaddr: {1,{['/','v','a','r','/','r','u','n','/','.','h','e','i','m',],},}, addrlen: 110) = 0
> >
> >   root@number:~#
> >
> > When we used to have:
> >
> >   root@number:~# perf trace -e connect ssh localhost
> >        0.000 ( 0.011 ms): ssh/786564 connect(fd: 3, uservaddr: { .family: INET, port: 22, addr: 127.0.0.1 }, addrlen: 16) = 0
> >        0.017 ( 0.006 ms): ssh/786564 connect(fd: 3, uservaddr: { .family: INET6, port: 22, addr: ::1 }, addrlen: 28) = 0
> >        0.263 ( 0.043 ms): ssh/786564 connect(fd: 3, uservaddr: { .family: INET6, port: 22, addr: ::1 }, addrlen: 28) = 0
> >       63.770 ( 0.044 ms): ssh/786564 connect(fd: 4, uservaddr: { .family: LOCAL, path: /var/run/.heim_org.h5l.kcm-socket }, addrlen: 110) = 0
> >       65.467 ( 0.042 ms): ssh/786564 connect(fd: 4, uservaddr: { .family: LOCAL, path: /var/run/.heim_org.h5l.kcm-socket }, addrlen: 110) = 0
> >   root@localhost's password:
> >
> > That is closer to what strace produces:
> >
> >   root@number:~# strace -e connect ssh localhost
> >   connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("127.0.0.1")}, 16) = 0
> >   connect(3, {sa_family=AF_INET6, sin6_port=htons(22), sin6_flowinfo=htonl(0), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_scope_id=0}, 28) = 0
> >   connect(3, {sa_family=AF_INET6, sin6_port=htons(22), sin6_flowinfo=htonl(0), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_scope_id=0}, 28) = 0
> >   connect(4, {sa_family=AF_UNIX, sun_path="/var/run/.heim_org.h5l.kcm-socket"}, 110) = 0
> >   connect(4, {sa_family=AF_UNIX, sun_path="/var/run/.heim_org.h5l.kcm-socket"}, 110) = 0
> >   root@localhost's password:
> >
> > Signed-off-by: Howard Chu <howardchu95@gmail.com>
> > Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
> > Cc: Adrian Hunter <adrian.hunter@intel.com>
> > Cc: Ian Rogers <irogers@google.com>
> > Cc: Jiri Olsa <jolsa@kernel.org>
> > Cc: Kan Liang <kan.liang@linux.intel.com>
> > Cc: Namhyung Kim <namhyung@kernel.org>
> > Link: https://lore.kernel.org/r/20240815013626.935097-10-howardchu95@gmail.com
> > Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
> > ---
> >  .../bpf_skel/augmented_raw_syscalls.bpf.c     | 114 +++++++++++++++++-
> >  1 file changed, 113 insertions(+), 1 deletion(-)
> >
> > diff --git a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
> > index 0acbd74e8c76..f29a8dfca044 100644
> > --- a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
> > +++ b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
> > @@ -7,6 +7,8 @@
> >   */
> >
> >  #include "vmlinux.h"
> > +#include "../trace_augment.h"
> > +
> >  #include <bpf/bpf_helpers.h>
> >  #include <linux/limits.h>
> >
> > @@ -124,6 +126,25 @@ struct augmented_args_tmp {
> >       __uint(max_entries, 1);
> >  } augmented_args_tmp SEC(".maps");
> >
> > +struct beauty_payload_enter {
> > +     struct syscall_enter_args args;
> > +     struct augmented_arg aug_args[6];
> > +};
>
> ⬢[acme@toolbox perf-tools-next]$ pahole -C augmented_arg /tmp/build/perf-tools-next/util/bpf_skel/.tmp/augmented_raw_syscalls.bpf.o
> struct augmented_arg {
>         unsigned int               size;                 /*     0     4 */
>         int                        err;                  /*     4     4 */
>         char                       value[4096];          /*     8  4096 */
>
>         /* size: 4104, cachelines: 65, members: 3 */
>         /* last cacheline: 8 bytes */
> };
>
> ⬢[acme@toolbox perf-tools-next]$
>
> ⬢[acme@toolbox perf-tools-next]$ pahole -C syscall_enter_args /tmp/build/perf-tools-next/util/bpf_skel/.tmp/augmented_raw_syscalls.bpf.o
> struct syscall_enter_args {
>         unsigned long long         common_tp_fields;     /*     0     8 */
>         long                       syscall_nr;           /*     8     8 */
>         unsigned long              args[6];              /*    16    48 */
>
>         /* size: 64, cachelines: 1, members: 3 */
> };
>
> So the entry has the theoretical max limit for the augmented payload
> which would be 6 * sizeof(struct augmented_arg) + the common header for
> tracepoints (unaugmented), a lot of space, but...

Yes, if I don't use this much space, and try to do a flexible payload
length, I won't be able to pass the BPF verifier. I will try to fix
this wasting problem.

>
> > +static int augment_sys_enter(void *ctx, struct syscall_enter_args *args)
> > +{
> > +     bool augmented, do_output = false;
> > +     int zero = 0, size, aug_size, index, output = 0,
> > +         value_size = sizeof(struct augmented_arg) - offsetof(struct augmented_arg, value);
> > +     unsigned int nr, *beauty_map;
> > +     struct beauty_payload_enter *payload;
> > +     void *arg, *payload_offset;
> > +
> > +     /* fall back to do predefined tail call */
> > +     if (args == NULL)
> > +             return 1;
> > +
> > +     /* use syscall number to get beauty_map entry */
> > +     nr             = (__u32)args->syscall_nr;
> > +     beauty_map     = bpf_map_lookup_elem(&beauty_map_enter, &nr);
>
> If we don't set up the entry for this syscall, then there are no
> pointers to collect, we return early and the tracepoint will not filter
> it and there it goes, up unmodified, if not already filtered by a
> tracepoint filter, ok, I'll add these comments here.

Thank you.

>
> > +     /* set up payload for output */
> > +     payload        = bpf_map_lookup_elem(&beauty_payload_enter_map, &zero);
> > +     payload_offset = (void *)&payload->aug_args;
> >
> > +     if (beauty_map == NULL || payload == NULL)
> > +             return 1;
>
> We can save the lookup for payload if the one for beauty_map returned
> NULL, I'll do this fixup.

Thanks that makes sense.

>
> > +     /* copy the sys_enter header, which has the syscall_nr */
> > +     __builtin_memcpy(&payload->args, args, sizeof(struct syscall_enter_args));
>
>
> > +     /*
> > +      * Determine what type of argument and how many bytes to read from user space, using the
> > +      * value in the beauty_map. This is the relation of parameter type and its corresponding
> > +      * value in the beauty map, and how many bytes we read eventually:
> > +      *
> > +      * string: 1                          -> size of string
> > +      * struct: size of struct             -> size of struct
> > +      * buffer: -1 * (index of paired len) -> value of paired len (maximum: TRACE_AUG_MAX_BUF)
>
> 'paired_len'? Ok, 1, size of string or struct

I mean the buffer always comes with a size indicator. For instance
read syscall's 'buf' and 'count', in this case 'count' will pair with
'buf'. Sorry for the confusion. See below:

read: (unsigned int fd, char *buf, size_t count)

>
>
> Will continue the detailed analysis later, as I need to change the
> existing BPF collector, before applying this, to match the protocol here
> (that will always have the size of each argument encoded in the ring
> buffer, easier, but uses more space).

Thank you for adding the protocol data size fix.

>
> - Arnaldo
>
> > +      */
> > +     for (int i = 0; i < 6; i++) {
> > +             arg = (void *)args->args[i];
> > +             augmented = false;
> > +             size = beauty_map[i];
> > +             aug_size = size; /* size of the augmented data read from user space */
> > +
> > +             if (size == 0 || arg == NULL)
> > +                     continue;
> > +
> > +             if (size == 1) { /* string */
> > +                     aug_size = bpf_probe_read_user_str(((struct augmented_arg *)payload_offset)->value, value_size, arg);
> > +                     /* minimum of 0 to pass the verifier */
> > +                     if (aug_size < 0)
> > +                             aug_size = 0;
> > +
> > +                     augmented = true;
> > +             } else if (size > 0 && size <= value_size) { /* struct */
> > +                     if (!bpf_probe_read_user(((struct augmented_arg *)payload_offset)->value, size, arg))
> > +                             augmented = true;
> > +             } else if (size < 0 && size >= -6) { /* buffer */
> > +                     index = -(size + 1);
> > +                     aug_size = args->args[index];
> > +
> > +                     if (aug_size > TRACE_AUG_MAX_BUF)
> > +                             aug_size = TRACE_AUG_MAX_BUF;
> > +
> > +                     if (aug_size > 0) {
> > +                             if (!bpf_probe_read_user(((struct augmented_arg *)payload_offset)->value, aug_size, arg))
> > +                                     augmented = true;
> > +                     }
> > +             }
> > +
> > +             /* write data to payload */
> > +             if (augmented) {
> > +                     int written = offsetof(struct augmented_arg, value) + aug_size;
> > +
> > +                     ((struct augmented_arg *)payload_offset)->size = aug_size;
> > +                     output += written;
> > +                     payload_offset += written;
> > +                     do_output = true;
> > +             }
> > +     }
> > +
> > +     if (!do_output)
> > +             return 1;
> > +
> > +     return augmented__beauty_output(ctx, payload, sizeof(struct syscall_enter_args) + output);
>
> We end up pushing up to the ring buffer all the way to

Thanks,
Howard
Re: [PATCH v3 6/8] perf trace: Collect augmented data using BPF
Posted by Arnaldo Carvalho de Melo 1 year, 3 months ago
On Wed, Sep 04, 2024 at 02:11:35PM -0700, Howard Chu wrote:
> On Wed, Sep 4, 2024 at 12:53 PM Arnaldo Carvalho de Melo <acme@kernel.org> wrote:
> > On Sun, Aug 25, 2024 at 12:33:20AM +0800, Howard Chu wrote:
> > ⬢[acme@toolbox perf-tools-next]$ pahole -C syscall_enter_args /tmp/build/perf-tools-next/util/bpf_skel/.tmp/augmented_raw_syscalls.bpf.o
> > struct syscall_enter_args {
> >         unsigned long long         common_tp_fields;     /*     0     8 */
> >         long                       syscall_nr;           /*     8     8 */
> >         unsigned long              args[6];              /*    16    48 */
> >
> >         /* size: 64, cachelines: 1, members: 3 */
> > };
> >
> > So the entry has the theoretical max limit for the augmented payload
> > which would be 6 * sizeof(struct augmented_arg) + the common header for
> > tracepoints (unaugmented), a lot of space, but...
> 
> Yes, if I don't use this much space, and try to do a flexible payload
> length, I won't be able to pass the BPF verifier. I will try to fix
> this wasting problem.

But then we don't use it, I need to look at the implementation of BPF
maps, to see how bad is this, but I doubt it is that much, well see.

- Arnaldo