[PATCH v1 2/4] perf trace: Migrate BPF augmentation to use a skeleton

Posted by Ian Rogers 2 years, 6 months ago
Previously a BPF event of augmented_raw_syscalls.c could be used to
enable augmentation of syscalls by perf trace. As BPF events are no
longer supported, switch to using a BPF skeleton which, when attached,
explicitly opens the sys_enter and sys_exit tracepoints.

The dump map is removed as debugging wasn't supported by the
augmentation, and bpf_printk can be used when necessary.

Remove tools/perf/examples/bpf/augmented_raw_syscalls.c so that the
rename/migration to a BPF skeleton captures that this was the source.
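
For reference, a minimal sketch of the bpf_printk() alternative
mentioned above (a hypothetical debug line, not part of this patch;
output typically lands in /sys/kernel/debug/tracing/trace_pipe):

	/* e.g. inside sys_enter_openat() in augmented_raw_syscalls.bpf.c:
	 * print the raw filename pointer for ad-hoc debugging */
	bpf_printk("openat filename ptr: %lx", args->args[1]);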

Signed-off-by: Ian Rogers <irogers@google.com>
---
 tools/perf/Makefile.perf                      |   1 +
 tools/perf/builtin-trace.c                    | 180 +++++++++++-------
 .../bpf_skel/augmented_raw_syscalls.bpf.c}    |  27 +--
 3 files changed, 131 insertions(+), 77 deletions(-)
 rename tools/perf/{examples/bpf/augmented_raw_syscalls.c => util/bpf_skel/augmented_raw_syscalls.bpf.c} (96%)

diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index 6ec5079fd697..0e1597712b95 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -1042,6 +1042,7 @@ SKELETONS += $(SKEL_OUT)/bperf_cgroup.skel.h $(SKEL_OUT)/func_latency.skel.h
 SKELETONS += $(SKEL_OUT)/off_cpu.skel.h $(SKEL_OUT)/lock_contention.skel.h
 SKELETONS += $(SKEL_OUT)/kwork_trace.skel.h $(SKEL_OUT)/sample_filter.skel.h
 SKELETONS += $(SKEL_OUT)/bench_uprobe.skel.h
+SKELETONS += $(SKEL_OUT)/augmented_raw_syscalls.skel.h
 
 $(SKEL_TMP_OUT) $(LIBAPI_OUTPUT) $(LIBBPF_OUTPUT) $(LIBPERF_OUTPUT) $(LIBSUBCMD_OUTPUT) $(LIBSYMBOL_OUTPUT):
 	$(Q)$(MKDIR) -p $@
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 59862467e781..8625fca42cd8 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -19,6 +19,9 @@
 #ifdef HAVE_LIBBPF_SUPPORT
 #include <bpf/bpf.h>
 #include <bpf/libbpf.h>
+#ifdef HAVE_BPF_SKEL
+#include "bpf_skel/augmented_raw_syscalls.skel.h"
+#endif
 #endif
 #include "util/bpf_map.h"
 #include "util/rlimit.h"
@@ -127,25 +130,19 @@ struct trace {
 	struct syscalltbl	*sctbl;
 	struct {
 		struct syscall  *table;
-		struct { // per syscall BPF_MAP_TYPE_PROG_ARRAY
-			struct bpf_map  *sys_enter,
-					*sys_exit;
-		}		prog_array;
 		struct {
 			struct evsel *sys_enter,
-					  *sys_exit,
-					  *augmented;
+				*sys_exit,
+				*bpf_output;
 		}		events;
-		struct bpf_program *unaugmented_prog;
 	} syscalls;
-	struct {
-		struct bpf_map *map;
-	} dump;
+#ifdef HAVE_BPF_SKEL
+	struct augmented_raw_syscalls_bpf *skel;
+#endif
 	struct record_opts	opts;
 	struct evlist	*evlist;
 	struct machine		*host;
 	struct thread		*current;
-	struct bpf_object	*bpf_obj;
 	struct cgroup		*cgroup;
 	u64			base_time;
 	FILE			*output;
@@ -415,6 +412,7 @@ static int evsel__init_syscall_tp(struct evsel *evsel)
 		if (evsel__init_tp_uint_field(evsel, &sc->id, "__syscall_nr") &&
 		    evsel__init_tp_uint_field(evsel, &sc->id, "nr"))
 			return -ENOENT;
+
 		return 0;
 	}
 
@@ -2845,7 +2843,7 @@ static int trace__event_handler(struct trace *trace, struct evsel *evsel,
 	if (thread)
 		trace__fprintf_comm_tid(trace, thread, trace->output);
 
-	if (evsel == trace->syscalls.events.augmented) {
+	if (evsel == trace->syscalls.events.bpf_output) {
 		int id = perf_evsel__sc_tp_uint(evsel, id, sample);
 		struct syscall *sc = trace__syscall_info(trace, evsel, id);
 
@@ -3278,24 +3276,16 @@ static int trace__set_ev_qualifier_tp_filter(struct trace *trace)
 	goto out;
 }
 
-#ifdef HAVE_LIBBPF_SUPPORT
-static struct bpf_map *trace__find_bpf_map_by_name(struct trace *trace, const char *name)
-{
-	if (trace->bpf_obj == NULL)
-		return NULL;
-
-	return bpf_object__find_map_by_name(trace->bpf_obj, name);
-}
-
+#ifdef HAVE_BPF_SKEL
 static struct bpf_program *trace__find_bpf_program_by_title(struct trace *trace, const char *name)
 {
 	struct bpf_program *pos, *prog = NULL;
 	const char *sec_name;
 
-	if (trace->bpf_obj == NULL)
+	if (trace->skel->obj == NULL)
 		return NULL;
 
-	bpf_object__for_each_program(pos, trace->bpf_obj) {
+	bpf_object__for_each_program(pos, trace->skel->obj) {
 		sec_name = bpf_program__section_name(pos);
 		if (sec_name && !strcmp(sec_name, name)) {
 			prog = pos;
@@ -3313,12 +3303,14 @@ static struct bpf_program *trace__find_syscall_bpf_prog(struct trace *trace, str
 
 	if (prog_name == NULL) {
 		char default_prog_name[256];
-		scnprintf(default_prog_name, sizeof(default_prog_name), "!syscalls:sys_%s_%s", type, sc->name);
+		scnprintf(default_prog_name, sizeof(default_prog_name), "tp/syscalls/sys_%s_%s",
+			  type, sc->name);
 		prog = trace__find_bpf_program_by_title(trace, default_prog_name);
 		if (prog != NULL)
 			goto out_found;
 		if (sc->fmt && sc->fmt->alias) {
-			scnprintf(default_prog_name, sizeof(default_prog_name), "!syscalls:sys_%s_%s", type, sc->fmt->alias);
+			scnprintf(default_prog_name, sizeof(default_prog_name),
+				  "tp/syscalls/sys_%s_%s", type, sc->fmt->alias);
 			prog = trace__find_bpf_program_by_title(trace, default_prog_name);
 			if (prog != NULL)
 				goto out_found;
@@ -3336,7 +3328,7 @@ static struct bpf_program *trace__find_syscall_bpf_prog(struct trace *trace, str
 	pr_debug("Couldn't find BPF prog \"%s\" to associate with syscalls:sys_%s_%s, not augmenting it\n",
 		 prog_name, type, sc->name);
 out_unaugmented:
-	return trace->syscalls.unaugmented_prog;
+	return trace->skel->progs.syscall_unaugmented;
 }
 
 static void trace__init_syscall_bpf_progs(struct trace *trace, int id)
@@ -3353,13 +3345,21 @@ static void trace__init_syscall_bpf_progs(struct trace *trace, int id)
 static int trace__bpf_prog_sys_enter_fd(struct trace *trace, int id)
 {
 	struct syscall *sc = trace__syscall_info(trace, NULL, id);
-	return sc ? bpf_program__fd(sc->bpf_prog.sys_enter) : bpf_program__fd(trace->syscalls.unaugmented_prog);
+
+	if (sc)
+		return bpf_program__fd(sc->bpf_prog.sys_enter);
+
+	return bpf_program__fd(trace->skel->progs.syscall_unaugmented);
 }
 
 static int trace__bpf_prog_sys_exit_fd(struct trace *trace, int id)
 {
 	struct syscall *sc = trace__syscall_info(trace, NULL, id);
-	return sc ? bpf_program__fd(sc->bpf_prog.sys_exit) : bpf_program__fd(trace->syscalls.unaugmented_prog);
+
+	if (sc)
+		return bpf_program__fd(sc->bpf_prog.sys_exit);
+
+	return bpf_program__fd(trace->skel->progs.syscall_unaugmented);
 }
 
 static struct bpf_program *trace__find_usable_bpf_prog_entry(struct trace *trace, struct syscall *sc)
@@ -3384,7 +3384,7 @@ static struct bpf_program *trace__find_usable_bpf_prog_entry(struct trace *trace
 		bool is_candidate = false;
 
 		if (pair == NULL || pair == sc ||
-		    pair->bpf_prog.sys_enter == trace->syscalls.unaugmented_prog)
+		    pair->bpf_prog.sys_enter == trace->skel->progs.syscall_unaugmented)
 			continue;
 
 		for (field = sc->args, candidate_field = pair->args;
@@ -3437,7 +3437,7 @@ static struct bpf_program *trace__find_usable_bpf_prog_entry(struct trace *trace
 		 */
 		if (pair_prog == NULL) {
 			pair_prog = trace__find_syscall_bpf_prog(trace, pair, pair->fmt ? pair->fmt->bpf_prog_name.sys_enter : NULL, "enter");
-			if (pair_prog == trace->syscalls.unaugmented_prog)
+			if (pair_prog == trace->skel->progs.syscall_unaugmented)
 				goto next_candidate;
 		}
 
@@ -3452,8 +3452,8 @@ static struct bpf_program *trace__find_usable_bpf_prog_entry(struct trace *trace
 
 static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace)
 {
-	int map_enter_fd = bpf_map__fd(trace->syscalls.prog_array.sys_enter),
-	    map_exit_fd  = bpf_map__fd(trace->syscalls.prog_array.sys_exit);
+	int map_enter_fd = bpf_map__fd(trace->skel->maps.syscalls_sys_enter);
+	int map_exit_fd  = bpf_map__fd(trace->skel->maps.syscalls_sys_exit);
 	int err = 0, key;
 
 	for (key = 0; key < trace->sctbl->syscalls.nr_entries; ++key) {
@@ -3515,7 +3515,7 @@ static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace)
 		 * For now we're just reusing the sys_enter prog, and if it
 		 * already has an augmenter, we don't need to find one.
 		 */
-		if (sc->bpf_prog.sys_enter != trace->syscalls.unaugmented_prog)
+		if (sc->bpf_prog.sys_enter != trace->skel->progs.syscall_unaugmented)
 			continue;
 
 		/*
@@ -3538,22 +3538,9 @@ static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace)
 			break;
 	}
 
-
 	return err;
 }
-
-#else // HAVE_LIBBPF_SUPPORT
-static struct bpf_map *trace__find_bpf_map_by_name(struct trace *trace __maybe_unused,
-						   const char *name __maybe_unused)
-{
-	return NULL;
-}
-
-static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace __maybe_unused)
-{
-	return 0;
-}
-#endif // HAVE_LIBBPF_SUPPORT
+#endif // HAVE_BPF_SKEL
 
 static int trace__set_ev_qualifier_filter(struct trace *trace)
 {
@@ -3917,13 +3904,31 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
 	err = evlist__open(evlist);
 	if (err < 0)
 		goto out_error_open;
+#ifdef HAVE_BPF_SKEL
+	{
+		struct perf_cpu cpu;
 
+		/*
+		 * Set up the __augmented_syscalls__ BPF map to hold for each
+		 * CPU the bpf-output event's file descriptor.
+		 */
+		perf_cpu_map__for_each_cpu(cpu, i, trace->syscalls.events.bpf_output->core.cpus) {
+			bpf_map__update_elem(trace->skel->maps.__augmented_syscalls__,
+					&cpu.cpu, sizeof(int),
+					xyarray__entry(trace->syscalls.events.bpf_output->core.fd,
+						       cpu.cpu, 0),
+					sizeof(__u32), BPF_ANY);
+		}
+	}
+#endif
 	err = trace__set_filter_pids(trace);
 	if (err < 0)
 		goto out_error_mem;
 
-	if (trace->syscalls.prog_array.sys_enter)
+#ifdef HAVE_BPF_SKEL
+	if (trace->skel->progs.sys_enter)
 		trace__init_syscalls_bpf_prog_array_maps(trace);
+#endif
 
 	if (trace->ev_qualifier_ids.nr > 0) {
 		err = trace__set_ev_qualifier_filter(trace);
@@ -3956,9 +3961,6 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
 	if (err < 0)
 		goto out_error_apply_filters;
 
-	if (trace->dump.map)
-		bpf_map__fprintf(trace->dump.map, trace->output);
-
 	err = evlist__mmap(evlist, trace->opts.mmap_pages);
 	if (err < 0)
 		goto out_error_mmap;
@@ -4655,6 +4657,18 @@ static void trace__exit(struct trace *trace)
 	zfree(&trace->perfconfig_events);
 }
 
+#ifdef HAVE_BPF_SKEL
+static int bpf__setup_bpf_output(struct evlist *evlist)
+{
+	int err = parse_event(evlist, "bpf-output/no-inherit=1,name=__augmented_syscalls__/");
+
+	if (err)
+		pr_debug("ERROR: failed to create the \"__augmented_syscalls__\" bpf-output event\n");
+
+	return err;
+}
+#endif
+
 int cmd_trace(int argc, const char **argv)
 {
 	const char *trace_usage[] = {
@@ -4686,7 +4700,6 @@ int cmd_trace(int argc, const char **argv)
 		.max_stack = UINT_MAX,
 		.max_events = ULONG_MAX,
 	};
-	const char *map_dump_str = NULL;
 	const char *output_name = NULL;
 	const struct option trace_options[] = {
 	OPT_CALLBACK('e', "event", &trace, "event",
@@ -4720,9 +4733,6 @@ int cmd_trace(int argc, const char **argv)
 	OPT_CALLBACK(0, "duration", &trace, "float",
 		     "show only events with duration > N.M ms",
 		     trace__set_duration),
-#ifdef HAVE_LIBBPF_SUPPORT
-	OPT_STRING(0, "map-dump", &map_dump_str, "BPF map", "BPF map to periodically dump"),
-#endif
 	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
 	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
 	OPT_BOOLEAN('T', "time", &trace.full_time,
@@ -4849,16 +4859,55 @@ int cmd_trace(int argc, const char **argv)
 				       "cgroup monitoring only available in system-wide mode");
 	}
 
-	err = -1;
+#ifdef HAVE_BPF_SKEL
+	trace.skel = augmented_raw_syscalls_bpf__open();
+	if (!trace.skel) {
+		pr_debug("Failed to open augmented syscalls BPF skeleton");
+	} else {
+		/*
+		 * Disable attaching the BPF programs except for sys_enter and
+		 * sys_exit that tail call into this as necessary.
+		 */
+		bpf_program__set_autoattach(trace.skel->progs.syscall_unaugmented,
+					    /*autoattach=*/false);
+		bpf_program__set_autoattach(trace.skel->progs.sys_enter_connect,
+					    /*autoattach=*/false);
+		bpf_program__set_autoattach(trace.skel->progs.sys_enter_sendto,
+					    /*autoattach=*/false);
+		bpf_program__set_autoattach(trace.skel->progs.sys_enter_open,
+					    /*autoattach=*/false);
+		bpf_program__set_autoattach(trace.skel->progs.sys_enter_openat,
+					    /*autoattach=*/false);
+		bpf_program__set_autoattach(trace.skel->progs.sys_enter_rename,
+					    /*autoattach=*/false);
+		bpf_program__set_autoattach(trace.skel->progs.sys_enter_renameat,
+					    /*autoattach=*/false);
+		bpf_program__set_autoattach(trace.skel->progs.sys_enter_perf_event_open,
+					    /*autoattach=*/false);
+		bpf_program__set_autoattach(trace.skel->progs.sys_enter_clock_nanosleep,
+					    /*autoattach=*/false);
+
+		err = augmented_raw_syscalls_bpf__load(trace.skel);
 
-	if (map_dump_str) {
-		trace.dump.map = trace__find_bpf_map_by_name(&trace, map_dump_str);
-		if (trace.dump.map == NULL) {
-			pr_err("ERROR: BPF map \"%s\" not found\n", map_dump_str);
-			goto out;
+		if (err < 0) {
+			pr_debug("Failed to load augmented syscalls BPF skeleton\n");
+		} else {
+			augmented_raw_syscalls_bpf__attach(trace.skel);
+			trace__add_syscall_newtp(&trace);
 		}
 	}
 
+	err = bpf__setup_bpf_output(trace.evlist);
+	if (err) {
+		libbpf_strerror(err, bf, sizeof(bf));
+		pr_err("ERROR: Setup BPF output event failed: %s\n", bf);
+		goto out;
+	}
+	trace.syscalls.events.bpf_output = evlist__last(trace.evlist);
+	assert(!strcmp(evsel__name(trace.syscalls.events.bpf_output), "__augmented_syscalls__"));
+#endif
+	err = -1;
+
 	if (trace.trace_pgfaults) {
 		trace.opts.sample_address = true;
 		trace.opts.sample_time = true;
@@ -4909,7 +4958,7 @@ int cmd_trace(int argc, const char **argv)
 	 * buffers that are being copied from kernel to userspace, think 'read'
 	 * syscall.
 	 */
-	if (trace.syscalls.events.augmented) {
+	if (trace.syscalls.events.bpf_output) {
 		evlist__for_each_entry(trace.evlist, evsel) {
 			bool raw_syscalls_sys_exit = strcmp(evsel__name(evsel), "raw_syscalls:sys_exit") == 0;
 
@@ -4918,9 +4967,9 @@ int cmd_trace(int argc, const char **argv)
 				goto init_augmented_syscall_tp;
 			}
 
-			if (trace.syscalls.events.augmented->priv == NULL &&
+			if (trace.syscalls.events.bpf_output->priv == NULL &&
 			    strstr(evsel__name(evsel), "syscalls:sys_enter")) {
-				struct evsel *augmented = trace.syscalls.events.augmented;
+				struct evsel *augmented = trace.syscalls.events.bpf_output;
 				if (evsel__init_augmented_syscall_tp(augmented, evsel) ||
 				    evsel__init_augmented_syscall_tp_args(augmented))
 					goto out;
@@ -5025,5 +5074,8 @@ int cmd_trace(int argc, const char **argv)
 		fclose(trace.output);
 out:
 	trace__exit(&trace);
+#ifdef HAVE_BPF_SKEL
+	augmented_raw_syscalls_bpf__destroy(trace.skel);
+#endif
 	return err;
 }
diff --git a/tools/perf/examples/bpf/augmented_raw_syscalls.c b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
similarity index 96%
rename from tools/perf/examples/bpf/augmented_raw_syscalls.c
rename to tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
index 9a03189d33d3..70478b9460ee 100644
--- a/tools/perf/examples/bpf/augmented_raw_syscalls.c
+++ b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
@@ -18,6 +18,8 @@
 #include <bpf/bpf_helpers.h>
 #include <linux/limits.h>
 
+#define MAX_CPUS  4096
+
 // FIXME: These should come from system headers
 typedef char bool;
 typedef int pid_t;
@@ -34,7 +36,7 @@ struct __augmented_syscalls__ {
 	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
 	__type(key, int);
 	__type(value, __u32);
-	__uint(max_entries, __NR_CPUS__);
+	__uint(max_entries, MAX_CPUS);
 } __augmented_syscalls__ SEC(".maps");
 
 /*
@@ -170,7 +172,7 @@ unsigned int augmented_arg__read_str(struct augmented_arg *augmented_arg, const
 	return augmented_len;
 }
 
-SEC("!raw_syscalls:unaugmented")
+SEC("tp/raw_syscalls/sys_enter")
 int syscall_unaugmented(struct syscall_enter_args *args)
 {
 	return 1;
@@ -182,7 +184,7 @@ int syscall_unaugmented(struct syscall_enter_args *args)
  * on from there, reading the first syscall arg as a string, i.e. open's
  * filename.
  */
-SEC("!syscalls:sys_enter_connect")
+SEC("tp/syscalls/sys_enter_connect")
 int sys_enter_connect(struct syscall_enter_args *args)
 {
 	struct augmented_args_payload *augmented_args = augmented_args_payload();
@@ -201,7 +203,7 @@ int sys_enter_connect(struct syscall_enter_args *args)
 	return augmented__output(args, augmented_args, len + socklen);
 }
 
-SEC("!syscalls:sys_enter_sendto")
+SEC("tp/syscalls/sys_enter_sendto")
 int sys_enter_sendto(struct syscall_enter_args *args)
 {
 	struct augmented_args_payload *augmented_args = augmented_args_payload();
@@ -220,7 +222,7 @@ int sys_enter_sendto(struct syscall_enter_args *args)
 	return augmented__output(args, augmented_args, len + socklen);
 }
 
-SEC("!syscalls:sys_enter_open")
+SEC("tp/syscalls/sys_enter_open")
 int sys_enter_open(struct syscall_enter_args *args)
 {
 	struct augmented_args_payload *augmented_args = augmented_args_payload();
@@ -235,7 +237,7 @@ int sys_enter_open(struct syscall_enter_args *args)
 	return augmented__output(args, augmented_args, len);
 }
 
-SEC("!syscalls:sys_enter_openat")
+SEC("tp/syscalls/sys_enter_openat")
 int sys_enter_openat(struct syscall_enter_args *args)
 {
 	struct augmented_args_payload *augmented_args = augmented_args_payload();
@@ -250,7 +252,7 @@ int sys_enter_openat(struct syscall_enter_args *args)
 	return augmented__output(args, augmented_args, len);
 }
 
-SEC("!syscalls:sys_enter_rename")
+SEC("tp/syscalls/sys_enter_rename")
 int sys_enter_rename(struct syscall_enter_args *args)
 {
 	struct augmented_args_payload *augmented_args = augmented_args_payload();
@@ -267,7 +269,7 @@ int sys_enter_rename(struct syscall_enter_args *args)
 	return augmented__output(args, augmented_args, len);
 }
 
-SEC("!syscalls:sys_enter_renameat")
+SEC("tp/syscalls/sys_enter_renameat")
 int sys_enter_renameat(struct syscall_enter_args *args)
 {
 	struct augmented_args_payload *augmented_args = augmented_args_payload();
@@ -295,7 +297,7 @@ struct perf_event_attr_size {
         __u32                   size;
 };
 
-SEC("!syscalls:sys_enter_perf_event_open")
+SEC("tp/syscalls/sys_enter_perf_event_open")
 int sys_enter_perf_event_open(struct syscall_enter_args *args)
 {
 	struct augmented_args_payload *augmented_args = augmented_args_payload();
@@ -327,7 +329,7 @@ int sys_enter_perf_event_open(struct syscall_enter_args *args)
 	return 1; /* Failure: don't filter */
 }
 
-SEC("!syscalls:sys_enter_clock_nanosleep")
+SEC("tp/syscalls/sys_enter_clock_nanosleep")
 int sys_enter_clock_nanosleep(struct syscall_enter_args *args)
 {
 	struct augmented_args_payload *augmented_args = augmented_args_payload();
@@ -358,7 +360,7 @@ static bool pid_filter__has(struct pids_filtered *pids, pid_t pid)
 	return bpf_map_lookup_elem(pids, &pid) != NULL;
 }
 
-SEC("raw_syscalls:sys_enter")
+SEC("tp/raw_syscalls/sys_enter")
 int sys_enter(struct syscall_enter_args *args)
 {
 	struct augmented_args_payload *augmented_args;
@@ -371,7 +373,6 @@ int sys_enter(struct syscall_enter_args *args)
 	 * We'll add to this as we add augmented syscalls right after that
 	 * initial, non-augmented raw_syscalls:sys_enter payload.
 	 */
-	unsigned int len = sizeof(augmented_args->args);
 
 	if (pid_filter__has(&pids_filtered, getpid()))
 		return 0;
@@ -393,7 +394,7 @@ int sys_enter(struct syscall_enter_args *args)
 	return 0;
 }
 
-SEC("raw_syscalls:sys_exit")
+SEC("tp/raw_syscalls/sys_exit")
 int sys_exit(struct syscall_exit_args *args)
 {
 	struct syscall_exit_args exit_args;
-- 
2.41.0.640.ga95def55d0-goog
Re: [PATCH v1 2/4] perf trace: Migrate BPF augmentation to use a skeleton
Posted by Arnaldo Carvalho de Melo 2 years, 5 months ago
On Thu, Aug 10, 2023 at 11:48:51AM -0700, Ian Rogers wrote:
> Previously a BPF event of augmented_raw_syscalls.c could be used to
> enable augmentation of syscalls by perf trace. As BPF events are no
> longer supported, switch to using a BPF skeleton which when attached
> explicitly opens the sysenter and sysexit tracepoints.
> 
> The dump map is removed as debugging wasn't supported by the
> augmentation and bpf_printk can be used when necessary.
> 
> Remove tools/perf/examples/bpf/augmented_raw_syscalls.c so that the
> rename/migration to a BPF skeleton captures that this was the source.
> +#ifdef HAVE_BPF_SKEL
> +	trace.skel = augmented_raw_syscalls_bpf__open();
> +	if (!trace.skel) {
> +		pr_debug("Failed to open augmented syscalls BPF skeleton");
> +	} else {
> +		/*
> +		 * Disable attaching the BPF programs except for sys_enter and
> +		 * sys_exit that tail call into this as necessary.
> +		 */
> +		bpf_program__set_autoattach(trace.skel->progs.syscall_unaugmented,
> +					    /*autoattach=*/false);
> +		bpf_program__set_autoattach(trace.skel->progs.sys_enter_connect,
> +					    /*autoattach=*/false);
> +		bpf_program__set_autoattach(trace.skel->progs.sys_enter_sendto,
> +					    /*autoattach=*/false);
> +		bpf_program__set_autoattach(trace.skel->progs.sys_enter_open,
> +					    /*autoattach=*/false);
> +		bpf_program__set_autoattach(trace.skel->progs.sys_enter_openat,
> +					    /*autoattach=*/false);
> +		bpf_program__set_autoattach(trace.skel->progs.sys_enter_rename,
> +					    /*autoattach=*/false);
> +		bpf_program__set_autoattach(trace.skel->progs.sys_enter_renameat,
> +					    /*autoattach=*/false);
> +		bpf_program__set_autoattach(trace.skel->progs.sys_enter_perf_event_open,
> +					    /*autoattach=*/false);
> +		bpf_program__set_autoattach(trace.skel->progs.sys_enter_clock_nanosleep,
> +					    /*autoattach=*/false);
> +
> +		err = augmented_raw_syscalls_bpf__load(trace.skel);
>  

So I converted the above to:

		struct bpf_program *prog;

		bpf_object__for_each_program(prog, trace.skel->obj) {
			if (prog != trace.skel->progs.sys_enter && prog != trace.skel->progs.sys_exit)
				bpf_program__set_autoattach(prog, /*autoattach=*/false);
		}

So that we don't have to add new lines disabling attachment when adding
support for other pointer-receiving syscalls.
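
With that loop in place, supporting another pointer-receiving syscall
should only need a new handler in the .bpf.c. A hypothetical sketch,
modeled on the existing sys_enter_openat handler (statx picked purely
as an example, not part of the patch):

SEC("tp/syscalls/sys_enter_statx")
int sys_enter_statx(struct syscall_enter_args *args)
{
	struct augmented_args_payload *augmented_args = augmented_args_payload();
	const void *filename_arg = (const void *)args->args[1];
	unsigned int len = sizeof(augmented_args->args);

	if (augmented_args == NULL)
		return 1; /* Failure: don't filter */

	/* Found by section name at runtime via
	 * trace__find_bpf_program_by_title(); autoattach is disabled
	 * by the generic loop above. */
	len += augmented_arg__read_str(&augmented_args->arg, filename_arg,
				       sizeof(augmented_args->arg.value));

	return augmented__output(args, augmented_args, len);
}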

- Arnaldo
Re: [PATCH v1 2/4] perf trace: Migrate BPF augmentation to use a skeleton
Posted by Arnaldo Carvalho de Melo 2 years, 6 months ago
On Thu, Aug 10, 2023 at 11:48:51AM -0700, Ian Rogers wrote:
> Previously a BPF event of augmented_raw_syscalls.c could be used to
> enable augmentation of syscalls by perf trace. As BPF events are no
> longer supported, switch to using a BPF skeleton which when attached
> explicitly opens the sysenter and sysexit tracepoints.
> 
> The dump map is removed as debugging wasn't supported by the
> augmentation and bpf_printk can be used when necessary.
> 
> Remove tools/perf/examples/bpf/augmented_raw_syscalls.c so that the
> rename/migration to a BPF skeleton captures that this was the source.

So, there is a problem where the augmented_raw_syscalls connect/sendto
handlers are being rejected by the verifier. The way you did it makes
perf print the verifier output and then continue without augmentation;
unsure if this is a good default, opinions?

[root@quaco ~]# perf trace -e open*
libbpf: prog 'sys_enter_connect': BPF program load failed: Permission denied
libbpf: prog 'sys_enter_connect': -- BEGIN PROG LOAD LOG --
reg type unsupported for arg#0 function sys_enter_connect#59
0: R1=ctx(off=0,imm=0) R10=fp0
; int sys_enter_connect(struct syscall_enter_args *args)
0: (bf) r6 = r1                       ; R1=ctx(off=0,imm=0) R6_w=ctx(off=0,imm=0)
1: (b7) r1 = 0                        ; R1_w=0
; int key = 0;
2: (63) *(u32 *)(r10 -4) = r1         ; R1_w=0 R10=fp0 fp-8=0000????
3: (bf) r2 = r10                      ; R2_w=fp0 R10=fp0
;
4: (07) r2 += -4                      ; R2_w=fp-4
; return bpf_map_lookup_elem(&augmented_args_tmp, &key);
5: (18) r1 = 0xffff8de5ae1d4600       ; R1_w=map_ptr(off=0,ks=4,vs=8272,imm=0)
7: (85) call bpf_map_lookup_elem#1    ; R0_w=map_value_or_null(id=1,off=0,ks=4,vs=8272,imm=0)
8: (bf) r7 = r0                       ; R0_w=map_value_or_null(id=1,off=0,ks=4,vs=8272,imm=0) R7_w=map_value_or_null(id=1,off=0,ks=4,vs=8272,imm=0)
9: (b7) r0 = 1                        ; R0_w=1
; if (augmented_args == NULL)
10: (15) if r7 == 0x0 goto pc+25      ; R7_w=map_value(off=0,ks=4,vs=8272,imm=0)
; unsigned int socklen = args->args[2];
11: (79) r1 = *(u64 *)(r6 +32)        ; R1_w=scalar() R6_w=ctx(off=0,imm=0)
;
12: (bf) r2 = r1                      ; R1_w=scalar(id=2) R2_w=scalar(id=2)
13: (67) r2 <<= 32                    ; R2_w=scalar(smax=9223372032559808512,umax=18446744069414584320,var_off=(0x0; 0xffffffff00000000),s32_min=0,s32_max=0,u32_max=0)
14: (77) r2 >>= 32                    ; R2_w=scalar(umax=4294967295,var_off=(0x0; 0xffffffff))
15: (b7) r8 = 128                     ; R8=128
; if (socklen > sizeof(augmented_args->saddr))
16: (25) if r2 > 0x80 goto pc+1       ; R2=scalar(umax=128,var_off=(0x0; 0xff))
17: (bf) r8 = r1                      ; R1=scalar(id=2) R8_w=scalar(id=2)
; const void *sockaddr_arg = (const void *)args->args[1];
18: (79) r3 = *(u64 *)(r6 +24)        ; R3_w=scalar() R6=ctx(off=0,imm=0)
; bpf_probe_read(&augmented_args->saddr, socklen, sockaddr_arg);
19: (bf) r1 = r7                      ; R1_w=map_value(off=0,ks=4,vs=8272,imm=0) R7=map_value(off=0,ks=4,vs=8272,imm=0)
20: (07) r1 += 64                     ; R1_w=map_value(off=64,ks=4,vs=8272,imm=0)
; bpf_probe_read(&augmented_args->saddr, socklen, sockaddr_arg);
21: (bf) r2 = r8                      ; R2_w=scalar(id=2) R8_w=scalar(id=2)
22: (85) call bpf_probe_read#4
R2 min value is negative, either use unsigned or 'var &= const'
processed 22 insns (limit 1000000) max_states_per_insn 0 total_states 1 peak_states 1 mark_read 1
-- END PROG LOAD LOG --
libbpf: prog 'sys_enter_connect': failed to load: -13
libbpf: failed to load object 'augmented_raw_syscalls_bpf'
libbpf: failed to load BPF skeleton 'augmented_raw_syscalls_bpf': -13
     0.000 systemd-oomd/959 openat(dfd: CWD, filename: 0xc0a2a2bd, flags: RDONLY|CLOEXEC) = 12
    86.339 thermald/1234 openat(dfd: CWD, filename: 0xac000ba0)  = 13
    87.008 thermald/1234 openat(dfd: CWD, filename: 0xac000eb0)  = 13
    87.270 thermald/1234 openat(dfd: CWD, filename: 0xac000b70)  = 13
    89.657 thermald/1234 openat(dfd: CWD, filename: 0xac000eb0)  = 13
^C
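
For what it's worth, the usual way past "R2 min value is negative" is
to give the verifier a hard upper bound on the length register with a
mask, since the compiler can reload the pre-branch value after the
comparison. A minimal sketch, assuming saddr stays at 128 bytes (a
power of two, as the log's "if r2 > 0x80" suggests):

	unsigned int socklen = args->args[2];

	/* The mask, not the branch, is what bounds the register the
	 * verifier sees at the bpf_probe_read() call; note it caps
	 * the copy at sizeof(augmented_args->saddr) - 1 bytes. */
	socklen &= sizeof(augmented_args->saddr) - 1;

	bpf_probe_read(&augmented_args->saddr, socklen, sockaddr_arg);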

If I comment out the connect and sendto handlers, it doesn't build
anymore, whereas before it would continue with the other handlers:

  CLANG   /tmp/build/perf-tools-next/util/bpf_skel/.tmp/augmented_raw_syscalls.bpf.o
  GENSKEL /tmp/build/perf-tools-next/util/bpf_skel/augmented_raw_syscalls.skel.h
  CC      /tmp/build/perf-tools-next/builtin-trace.o
builtin-trace.c: In function ‘cmd_trace’:
builtin-trace.c:4873:63: error: ‘struct <anonymous>’ has no member named ‘sys_enter_connect’; did you mean ‘sys_enter_openat’?
 4873 |                 bpf_program__set_autoattach(trace.skel->progs.sys_enter_connect,
      |                                                               ^~~~~~~~~~~~~~~~~
      |                                                               sys_enter_openat
builtin-trace.c:4875:63: error: ‘struct <anonymous>’ has no member named ‘sys_enter_sendto’; did you mean ‘sys_enter_openat’?
 4875 |                 bpf_program__set_autoattach(trace.skel->progs.sys_enter_sendto,
      |                                                               ^~~~~~~~~~~~~~~~
      |                                                               sys_enter_openat
make[3]: *** [/home/acme/git/perf-tools-next/tools/build/Makefile.build:97: /tmp/build/perf-tools-next/builtin-trace.o] Error 1
make[2]: *** [Makefile.perf:662: /tmp/build/perf-tools-next/perf-in.o] Error 2
make[1]: *** [Makefile.perf:238: sub-make] Error 2
make: *** [Makefile:113: install-bin] Error 2
make: Leaving directory '/home/acme/git/perf-tools-next/tools/perf'
[acme@quaco perf-tools-next]$

I.e. there is no need to refer to those explicitly; I think in the past
it just checked if the program was there and, if so, attached it. I'll
try to fix this.

If I remove the explicit references in builtin-trace.c:

[root@quaco ~]# perf trace -e open* --max-events=10
     0.000 thermald/1234 openat(dfd: CWD, filename: "/sys/class/powercap/intel-rapl/intel-rapl:0/intel-rapl:0:2/energy_uj") = 13
     0.236 thermald/1234 openat(dfd: CWD, filename: "/sys/class/powercap/intel-rapl/intel-rapl:0/energy_uj") = 13
     0.334 thermald/1234 openat(dfd: CWD, filename: "/sys/class/thermal/thermal_zone2/temp") = 13
     9.092 systemd-oomd/959 openat(dfd: CWD, filename: "/proc/meminfo", flags: RDONLY|CLOEXEC) = 12
   259.212 systemd-oomd/959 openat(dfd: CWD, filename: "/proc/meminfo", flags: RDONLY|CLOEXEC) = 12
   497.464 gpm/1049 openat(dfd: CWD, filename: "/dev/tty0") = 4
   509.044 systemd-oomd/959 openat(dfd: CWD, filename: "/proc/meminfo", flags: RDONLY|CLOEXEC) = 12
   509.559 systemd-oomd/959 openat(dfd: CWD, filename: "/sys/fs/cgroup/user.slice/user-1000.slice/user@1000.service/session.slice/memory.pressure", flags: RDONLY|CLOEXEC) = 12
   509.917 systemd-oomd/959 openat(dfd: CWD, filename: "/sys/fs/cgroup/user.slice/user-1000.slice/user@1000.service/session.slice/memory.current", flags: RDONLY|CLOEXEC) = 12
   510.111 systemd-oomd/959 openat(dfd: CWD, filename: "/sys/fs/cgroup/user.slice/user-1000.slice/user@1000.service/session.slice/memory.min", flags: RDONLY|CLOEXEC) = 12
[root@quaco ~]#

Cool!

Some inception:

[root@quaco ~]# perf trace -e perf_event_open perf stat -e cycles,instructions,cache-misses sleep 1
     0.000 perf_event_open(attr_uptr: { type: 0 (PERF_TYPE_HARDWARE), size: 136, config: 0 (PERF_COUNT_HW_CPU_CYCLES), sample_type: IDENTIFIER, read_format: TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING, disabled: 1, inherit: 1, enable_on_exec: 1, exclude_guest: 1 }, pid: 232297 (perf), cpu: -1, group_fd: -1, flags: FD_CLOEXEC) = 3
     0.063 perf_event_open(attr_uptr: { type: 0 (PERF_TYPE_HARDWARE), size: 136, config: 0x1 (PERF_COUNT_HW_INSTRUCTIONS), sample_type: IDENTIFIER, read_format: TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING, disabled: 1, inherit: 1, enable_on_exec: 1, exclude_guest: 1 }, pid: 232297 (perf), cpu: -1, group_fd: -1, flags: FD_CLOEXEC) = 4
     0.070 perf_event_open(attr_uptr: { type: 0 (PERF_TYPE_HARDWARE), size: 136, config: 0x3 (PERF_COUNT_HW_CACHE_MISSES), sample_type: IDENTIFIER, read_format: TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING, disabled: 1, inherit: 1, enable_on_exec: 1, exclude_guest: 1 }, pid: 232297 (perf), cpu: -1, group_fd: -1, flags: FD_CLOEXEC) = 5

 Performance counter stats for 'sleep 1':

         2,669,464      cycles
         1,842,319      instructions                     #    0.69  insn per cycle
            27,716      cache-misses

       1.001948592 seconds time elapsed

       0.000000000 seconds user
       0.001657000 seconds sys


[root@quaco ~]#

I'm putting what I have in the tmp.perf-tools-next branch, will continue
later today.

- Arnaldo
Re: [PATCH v1 2/4] perf trace: Migrate BPF augmentation to use a skeleton
Posted by Jiri Olsa 2 years, 6 months ago
On Thu, Aug 10, 2023 at 11:48:51AM -0700, Ian Rogers wrote:
> Previously a BPF event of augmented_raw_syscalls.c could be used to
> enable augmentation of syscalls by perf trace. As BPF events are no
> longer supported, switch to using a BPF skeleton which when attached
> explicitly opens the sysenter and sysexit tracepoints.
> 
> The dump map is removed as debugging wasn't supported by the
> augmentation and bpf_printk can be used when necessary.
> 
> Remove tools/perf/examples/bpf/augmented_raw_syscalls.c so that the
> rename/migration to a BPF skeleton captures that this was the source.

there's still some:

[jolsa@krava perf]$ grep -r augmented_raw_syscalls.c 
builtin-trace.c:         * (now tools/perf/examples/bpf/augmented_raw_syscalls.c, so that it
builtin-trace.c:                                 * tools/perf/examples/bpf/augmented_raw_syscalls.c,
Documentation/perf-trace.txt:   living in tools/perf/examples/bpf/augmented_raw_syscalls.c. For now this

jirka

Re: [PATCH v1 2/4] perf trace: Migrate BPF augmentation to use a skeleton
Posted by Ian Rogers 2 years, 6 months ago
On Fri, Aug 11, 2023 at 9:09 AM Jiri Olsa <olsajiri@gmail.com> wrote:
>
> On Thu, Aug 10, 2023 at 11:48:51AM -0700, Ian Rogers wrote:
> > Previously a BPF event of augmented_raw_syscalls.c could be used to
> > enable augmentation of syscalls by perf trace. As BPF events are no
> > longer supported, switch to using a BPF skeleton which when attached
> > explicitly opens the sysenter and sysexit tracepoints.
> >
> > The dump map is removed as debugging wasn't supported by the
> > augmentation and bpf_printk can be used when necessary.
> >
> > Remove tools/perf/examples/bpf/augmented_raw_syscalls.c so that the
> > rename/migration to a BPF skeleton captures that this was the source.
>
> there's still some:
>
> [jolsa@krava perf]$ grep -r augmented_raw_syscalls.c
> builtin-trace.c:         * (now tools/perf/examples/bpf/augmented_raw_syscalls.c, so that it
> builtin-trace.c:                                 * tools/perf/examples/bpf/augmented_raw_syscalls.c,
> Documentation/perf-trace.txt:   living in tools/perf/examples/bpf/augmented_raw_syscalls.c. For now this

Agreed, I'll double check, but the later patches remove these. I was
trying to keep this patch down to a minimal switch from one approach to
the other.

Thanks,
Ian

> >  {
> >       struct syscall *sc = trace__syscall_info(trace, NULL, id);
> > -     return sc ? bpf_program__fd(sc->bpf_prog.sys_exit) : bpf_program__fd(trace->syscalls.unaugmented_prog);
> > +
> > +     if (sc)
> > +             return bpf_program__fd(sc->bpf_prog.sys_exit);
> > +
> > +     return bpf_program__fd(trace->skel->progs.syscall_unaugmented);
> >  }
> >
> >  static struct bpf_program *trace__find_usable_bpf_prog_entry(struct trace *trace, struct syscall *sc)
> > @@ -3384,7 +3384,7 @@ static struct bpf_program *trace__find_usable_bpf_prog_entry(struct trace *trace
> >               bool is_candidate = false;
> >
> >               if (pair == NULL || pair == sc ||
> > -                 pair->bpf_prog.sys_enter == trace->syscalls.unaugmented_prog)
> > +                 pair->bpf_prog.sys_enter == trace->skel->progs.syscall_unaugmented)
> >                       continue;
> >
> >               for (field = sc->args, candidate_field = pair->args;
> > @@ -3437,7 +3437,7 @@ static struct bpf_program *trace__find_usable_bpf_prog_entry(struct trace *trace
> >                */
> >               if (pair_prog == NULL) {
> >                       pair_prog = trace__find_syscall_bpf_prog(trace, pair, pair->fmt ? pair->fmt->bpf_prog_name.sys_enter : NULL, "enter");
> > -                     if (pair_prog == trace->syscalls.unaugmented_prog)
> > +                     if (pair_prog == trace->skel->progs.syscall_unaugmented)
> >                               goto next_candidate;
> >               }
> >
> > @@ -3452,8 +3452,8 @@ static struct bpf_program *trace__find_usable_bpf_prog_entry(struct trace *trace
> >
> >  static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace)
> >  {
> > -     int map_enter_fd = bpf_map__fd(trace->syscalls.prog_array.sys_enter),
> > -         map_exit_fd  = bpf_map__fd(trace->syscalls.prog_array.sys_exit);
> > +     int map_enter_fd = bpf_map__fd(trace->skel->maps.syscalls_sys_enter);
> > +     int map_exit_fd  = bpf_map__fd(trace->skel->maps.syscalls_sys_exit);
> >       int err = 0, key;
> >
> >       for (key = 0; key < trace->sctbl->syscalls.nr_entries; ++key) {
> > @@ -3515,7 +3515,7 @@ static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace)
> >                * For now we're just reusing the sys_enter prog, and if it
> >                * already has an augmenter, we don't need to find one.
> >                */
> > -             if (sc->bpf_prog.sys_enter != trace->syscalls.unaugmented_prog)
> > +             if (sc->bpf_prog.sys_enter != trace->skel->progs.syscall_unaugmented)
> >                       continue;
> >
> >               /*
> > @@ -3538,22 +3538,9 @@ static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace)
> >                       break;
> >       }
> >
> > -
> >       return err;
> >  }
> > -
> > -#else // HAVE_LIBBPF_SUPPORT
> > -static struct bpf_map *trace__find_bpf_map_by_name(struct trace *trace __maybe_unused,
> > -                                                const char *name __maybe_unused)
> > -{
> > -     return NULL;
> > -}
> > -
> > -static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace __maybe_unused)
> > -{
> > -     return 0;
> > -}
> > -#endif // HAVE_LIBBPF_SUPPORT
> > +#endif // HAVE_BPF_SKEL
> >
> >  static int trace__set_ev_qualifier_filter(struct trace *trace)
> >  {
> > @@ -3917,13 +3904,31 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
> >       err = evlist__open(evlist);
> >       if (err < 0)
> >               goto out_error_open;
> > +#ifdef HAVE_BPF_SKEL
> > +     {
> > +             struct perf_cpu cpu;
> >
> > +             /*
> > +              * Set up the __augmented_syscalls__ BPF map to hold for each
> > +              * CPU the bpf-output event's file descriptor.
> > +              */
> > +             perf_cpu_map__for_each_cpu(cpu, i, trace->syscalls.events.bpf_output->core.cpus) {
> > +                     bpf_map__update_elem(trace->skel->maps.__augmented_syscalls__,
> > +                                     &cpu.cpu, sizeof(int),
> > +                                     xyarray__entry(trace->syscalls.events.bpf_output->core.fd,
> > +                                                    cpu.cpu, 0),
> > +                                     sizeof(__u32), BPF_ANY);
> > +             }
> > +     }
> > +#endif
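
A BPF_MAP_TYPE_PERF_EVENT_ARRAY map is indexed by CPU, so the loop above
stores each CPU's bpf-output event fd at that CPU's index. The BPF side
can then emit into the buffer of whichever CPU it is running on, roughly
like this -- a sketch, with payload/len standing in for the real
arguments; the actual call lives in augmented_raw_syscalls.bpf.c:

	bpf_perf_event_output(args, &__augmented_syscalls__,
			      BPF_F_CURRENT_CPU, payload, len);
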
> >       err = trace__set_filter_pids(trace);
> >       if (err < 0)
> >               goto out_error_mem;
> >
> > -     if (trace->syscalls.prog_array.sys_enter)
> > +#ifdef HAVE_BPF_SKEL
> > +     if (trace->skel->progs.sys_enter)
> >               trace__init_syscalls_bpf_prog_array_maps(trace);
> > +#endif
> >
> >       if (trace->ev_qualifier_ids.nr > 0) {
> >               err = trace__set_ev_qualifier_filter(trace);
> > @@ -3956,9 +3961,6 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
> >       if (err < 0)
> >               goto out_error_apply_filters;
> >
> > -     if (trace->dump.map)
> > -             bpf_map__fprintf(trace->dump.map, trace->output);
> > -
> >       err = evlist__mmap(evlist, trace->opts.mmap_pages);
> >       if (err < 0)
> >               goto out_error_mmap;
> > @@ -4655,6 +4657,18 @@ static void trace__exit(struct trace *trace)
> >       zfree(&trace->perfconfig_events);
> >  }
> >
> > +#ifdef HAVE_BPF_SKEL
> > +static int bpf__setup_bpf_output(struct evlist *evlist)
> > +{
> > +     int err = parse_event(evlist, "bpf-output/no-inherit=1,name=__augmented_syscalls__/");
> > +
> > +     if (err)
> > +             pr_debug("ERROR: failed to create the \"__augmented_syscalls__\" bpf-output event\n");
> > +
> > +     return err;
> > +}
> > +#endif
> > +
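
Note: "bpf-output" is the PERF_COUNT_SW_BPF_OUTPUT software event. The
calling code relies on parse_event() appending it as the last entry of
the evlist, hence the evlist__last() and the assert on the
"__augmented_syscalls__" name further down.
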
> >  int cmd_trace(int argc, const char **argv)
> >  {
> >       const char *trace_usage[] = {
> > @@ -4686,7 +4700,6 @@ int cmd_trace(int argc, const char **argv)
> >               .max_stack = UINT_MAX,
> >               .max_events = ULONG_MAX,
> >       };
> > -     const char *map_dump_str = NULL;
> >       const char *output_name = NULL;
> >       const struct option trace_options[] = {
> >       OPT_CALLBACK('e', "event", &trace, "event",
> > @@ -4720,9 +4733,6 @@ int cmd_trace(int argc, const char **argv)
> >       OPT_CALLBACK(0, "duration", &trace, "float",
> >                    "show only events with duration > N.M ms",
> >                    trace__set_duration),
> > -#ifdef HAVE_LIBBPF_SUPPORT
> > -     OPT_STRING(0, "map-dump", &map_dump_str, "BPF map", "BPF map to periodically dump"),
> > -#endif
> >       OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
> >       OPT_INCR('v', "verbose", &verbose, "be more verbose"),
> >       OPT_BOOLEAN('T', "time", &trace.full_time,
> > @@ -4849,16 +4859,55 @@ int cmd_trace(int argc, const char **argv)
> >                                      "cgroup monitoring only available in system-wide mode");
> >       }
> >
> > -     err = -1;
> > +#ifdef HAVE_BPF_SKEL
> > +     trace.skel = augmented_raw_syscalls_bpf__open();
> > +     if (!trace.skel) {
> > +             pr_debug("Failed to open augmented syscalls BPF skeleton");
> > +     } else {
> > +             /*
> > +              * Disable attaching the BPF programs except for sys_enter and
> > +              * sys_exit that tail call into this as necessary.
> > +              */
> > +             bpf_program__set_autoattach(trace.skel->progs.syscall_unaugmented,
> > +                                         /*autoattach=*/false);
> > +             bpf_program__set_autoattach(trace.skel->progs.sys_enter_connect,
> > +                                         /*autoattach=*/false);
> > +             bpf_program__set_autoattach(trace.skel->progs.sys_enter_sendto,
> > +                                         /*autoattach=*/false);
> > +             bpf_program__set_autoattach(trace.skel->progs.sys_enter_open,
> > +                                         /*autoattach=*/false);
> > +             bpf_program__set_autoattach(trace.skel->progs.sys_enter_openat,
> > +                                         /*autoattach=*/false);
> > +             bpf_program__set_autoattach(trace.skel->progs.sys_enter_rename,
> > +                                         /*autoattach=*/false);
> > +             bpf_program__set_autoattach(trace.skel->progs.sys_enter_renameat,
> > +                                         /*autoattach=*/false);
> > +             bpf_program__set_autoattach(trace.skel->progs.sys_enter_perf_event_open,
> > +                                         /*autoattach=*/false);
> > +             bpf_program__set_autoattach(trace.skel->progs.sys_enter_clock_nanosleep,
> > +                                         /*autoattach=*/false);
> > +
> > +             err = augmented_raw_syscalls_bpf__load(trace.skel);
> >
> > -     if (map_dump_str) {
> > -             trace.dump.map = trace__find_bpf_map_by_name(&trace, map_dump_str);
> > -             if (trace.dump.map == NULL) {
> > -                     pr_err("ERROR: BPF map \"%s\" not found\n", map_dump_str);
> > -                     goto out;
> > +             if (err < 0) {
> > +                     pr_debug("Failed to load augmented syscalls BPF skeleton\n");
> > +             } else {
> > +                     augmented_raw_syscalls_bpf__attach(trace.skel);
> > +                     trace__add_syscall_newtp(&trace);
> >               }
> >       }
> >
> > +     err = bpf__setup_bpf_output(trace.evlist);
> > +     if (err) {
> > +             libbpf_strerror(err, bf, sizeof(bf));
> > +             pr_err("ERROR: Setup BPF output event failed: %s\n", bf);
> > +             goto out;
> > +     }
> > +     trace.syscalls.events.bpf_output = evlist__last(trace.evlist);
> > +     assert(!strcmp(evsel__name(trace.syscalls.events.bpf_output), "__augmented_syscalls__"));
> > +#endif
> > +     err = -1;
> > +
> >       if (trace.trace_pgfaults) {
> >               trace.opts.sample_address = true;
> >               trace.opts.sample_time = true;
> > @@ -4909,7 +4958,7 @@ int cmd_trace(int argc, const char **argv)
> >        * buffers that are being copied from kernel to userspace, think 'read'
> >        * syscall.
> >        */
> > -     if (trace.syscalls.events.augmented) {
> > +     if (trace.syscalls.events.bpf_output) {
> >               evlist__for_each_entry(trace.evlist, evsel) {
> >                       bool raw_syscalls_sys_exit = strcmp(evsel__name(evsel), "raw_syscalls:sys_exit") == 0;
> >
> > @@ -4918,9 +4967,9 @@ int cmd_trace(int argc, const char **argv)
> >                               goto init_augmented_syscall_tp;
> >                       }
> >
> > -                     if (trace.syscalls.events.augmented->priv == NULL &&
> > +                     if (trace.syscalls.events.bpf_output->priv == NULL &&
> >                           strstr(evsel__name(evsel), "syscalls:sys_enter")) {
> > -                             struct evsel *augmented = trace.syscalls.events.augmented;
> > +                             struct evsel *augmented = trace.syscalls.events.bpf_output;
> >                               if (evsel__init_augmented_syscall_tp(augmented, evsel) ||
> >                                   evsel__init_augmented_syscall_tp_args(augmented))
> >                                       goto out;
> > @@ -5025,5 +5074,8 @@ int cmd_trace(int argc, const char **argv)
> >               fclose(trace.output);
> >  out:
> >       trace__exit(&trace);
> > +#ifdef HAVE_BPF_SKEL
> > +     augmented_raw_syscalls_bpf__destroy(trace.skel);
> > +#endif
> >       return err;
> >  }
> > diff --git a/tools/perf/examples/bpf/augmented_raw_syscalls.c b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
> > similarity index 96%
> > rename from tools/perf/examples/bpf/augmented_raw_syscalls.c
> > rename to tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
> > index 9a03189d33d3..70478b9460ee 100644
> > --- a/tools/perf/examples/bpf/augmented_raw_syscalls.c
> > +++ b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
> > @@ -18,6 +18,8 @@
> >  #include <bpf/bpf_helpers.h>
> >  #include <linux/limits.h>
> >
> > +#define MAX_CPUS  4096
> > +
> >  // FIXME: These should come from system headers
> >  typedef char bool;
> >  typedef int pid_t;
> > @@ -34,7 +36,7 @@ struct __augmented_syscalls__ {
> >       __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
> >       __type(key, int);
> >       __type(value, __u32);
> > -     __uint(max_entries, __NR_CPUS__);
> > +     __uint(max_entries, MAX_CPUS);
> >  } __augmented_syscalls__ SEC(".maps");
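
The old __NR_CPUS__ define was injected by perf's BPF-event compilation;
a skeleton is compiled stand-alone, hence the fixed MAX_CPUS upper bound.
If this ever needs to track the actual CPU count, user space could
presumably resize the map before loading, e.g. (a sketch):

	bpf_map__set_max_entries(skel->maps.__augmented_syscalls__,
				 libbpf_num_possible_cpus());
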
> >
> >  /*
> > @@ -170,7 +172,7 @@ unsigned int augmented_arg__read_str(struct augmented_arg *augmented_arg, const
> >       return augmented_len;
> >  }
> >
> > -SEC("!raw_syscalls:unaugmented")
> > +SEC("tp/raw_syscalls/sys_enter")
> >  int syscall_unaugmented(struct syscall_enter_args *args)
> >  {
> >       return 1;
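
The "tp/<category>/<name>" section prefix is libbpf's tracepoint
auto-attach convention, replacing the perf-specific "!"-prefixed names.
Note that syscall_unaugmented now sits in a tp/raw_syscalls/sys_enter
section just like the real sys_enter program, which is why cmd_trace()
above disables autoattach for it: it exists only as a tail-call target
that returns 1.
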
> > @@ -182,7 +184,7 @@ int syscall_unaugmented(struct syscall_enter_args *args)
> >   * on from there, reading the first syscall arg as a string, i.e. open's
> >   * filename.
> >   */
> > -SEC("!syscalls:sys_enter_connect")
> > +SEC("tp/syscalls/sys_enter_connect")
> >  int sys_enter_connect(struct syscall_enter_args *args)
> >  {
> >       struct augmented_args_payload *augmented_args = augmented_args_payload();
> > @@ -201,7 +203,7 @@ int sys_enter_connect(struct syscall_enter_args *args)
> >       return augmented__output(args, augmented_args, len + socklen);
> >  }
> >
> > -SEC("!syscalls:sys_enter_sendto")
> > +SEC("tp/syscalls/sys_enter_sendto")
> >  int sys_enter_sendto(struct syscall_enter_args *args)
> >  {
> >       struct augmented_args_payload *augmented_args = augmented_args_payload();
> > @@ -220,7 +222,7 @@ int sys_enter_sendto(struct syscall_enter_args *args)
> >       return augmented__output(args, augmented_args, len + socklen);
> >  }
> >
> > -SEC("!syscalls:sys_enter_open")
> > +SEC("tp/syscalls/sys_enter_open")
> >  int sys_enter_open(struct syscall_enter_args *args)
> >  {
> >       struct augmented_args_payload *augmented_args = augmented_args_payload();
> > @@ -235,7 +237,7 @@ int sys_enter_open(struct syscall_enter_args *args)
> >       return augmented__output(args, augmented_args, len);
> >  }
> >
> > -SEC("!syscalls:sys_enter_openat")
> > +SEC("tp/syscalls/sys_enter_openat")
> >  int sys_enter_openat(struct syscall_enter_args *args)
> >  {
> >       struct augmented_args_payload *augmented_args = augmented_args_payload();
> > @@ -250,7 +252,7 @@ int sys_enter_openat(struct syscall_enter_args *args)
> >       return augmented__output(args, augmented_args, len);
> >  }
> >
> > -SEC("!syscalls:sys_enter_rename")
> > +SEC("tp/syscalls/sys_enter_rename")
> >  int sys_enter_rename(struct syscall_enter_args *args)
> >  {
> >       struct augmented_args_payload *augmented_args = augmented_args_payload();
> > @@ -267,7 +269,7 @@ int sys_enter_rename(struct syscall_enter_args *args)
> >       return augmented__output(args, augmented_args, len);
> >  }
> >
> > -SEC("!syscalls:sys_enter_renameat")
> > +SEC("tp/syscalls/sys_enter_renameat")
> >  int sys_enter_renameat(struct syscall_enter_args *args)
> >  {
> >       struct augmented_args_payload *augmented_args = augmented_args_payload();
> > @@ -295,7 +297,7 @@ struct perf_event_attr_size {
> >          __u32                   size;
> >  };
> >
> > -SEC("!syscalls:sys_enter_perf_event_open")
> > +SEC("tp/syscalls/sys_enter_perf_event_open")
> >  int sys_enter_perf_event_open(struct syscall_enter_args *args)
> >  {
> >       struct augmented_args_payload *augmented_args = augmented_args_payload();
> > @@ -327,7 +329,7 @@ int sys_enter_perf_event_open(struct syscall_enter_args *args)
> >       return 1; /* Failure: don't filter */
> >  }
> >
> > -SEC("!syscalls:sys_enter_clock_nanosleep")
> > +SEC("tp/syscalls/sys_enter_clock_nanosleep")
> >  int sys_enter_clock_nanosleep(struct syscall_enter_args *args)
> >  {
> >       struct augmented_args_payload *augmented_args = augmented_args_payload();
> > @@ -358,7 +360,7 @@ static bool pid_filter__has(struct pids_filtered *pids, pid_t pid)
> >       return bpf_map_lookup_elem(pids, &pid) != NULL;
> >  }
> >
> > -SEC("raw_syscalls:sys_enter")
> > +SEC("tp/raw_syscalls/sys_enter")
> >  int sys_enter(struct syscall_enter_args *args)
> >  {
> >       struct augmented_args_payload *augmented_args;
> > @@ -371,7 +373,6 @@ int sys_enter(struct syscall_enter_args *args)
> >        * We'll add to this as we add augmented syscalls right after that
> >        * initial, non-augmented raw_syscalls:sys_enter payload.
> >        */
> > -     unsigned int len = sizeof(augmented_args->args);
> >
> >       if (pid_filter__has(&pids_filtered, getpid()))
> >               return 0;
> > @@ -393,7 +394,7 @@ int sys_enter(struct syscall_enter_args *args)
> >       return 0;
> >  }
> >
> > -SEC("raw_syscalls:sys_exit")
> > +SEC("tp/raw_syscalls/sys_exit")
> >  int sys_exit(struct syscall_exit_args *args)
> >  {
> >       struct syscall_exit_args exit_args;
> > --
> > 2.41.0.640.ga95def55d0-goog
> >