[PATCH v2] perf trace: Implement syscall summary in BPF

Namhyung Kim posted 1 patch 9 months ago
There is a newer version of this series
[PATCH v2] perf trace: Implement syscall summary in BPF
Posted by Namhyung Kim 9 months ago
When -s/--summary option is used, it doesn't need (augmented) arguments
of syscalls.  Let's skip the augmentation and load another small BPF
program to collect the statistics in the kernel instead of copying the
data to the ring-buffer to calculate the stats in userspace.  This will
be much more light-weight than the existing approach and remove any lost
events.

Let's add a new option --bpf-summary to control this behavior.  I cannot
make it default because there's no way to get e_machine in the BPF which
is needed for detecting different ABIs like 32-bit compat mode.

No functional changes intended except for no more LOST events. :)

  $ sudo perf trace -as --bpf-summary --summary-mode=total -- sleep 1

   Summary of events:

   total, 2824 events

     syscall            calls  errors  total       min       avg       max       stddev
                                       (msec)    (msec)    (msec)    (msec)        (%)
     --------------- --------  ------ -------- --------- --------- ---------     ------
     futex                372     18  4373.773     0.000    11.757   997.715    660.42%
     poll                 241      0  2757.963     0.000    11.444   997.758    580.34%
     epoll_wait           161      0  2460.854     0.000    15.285   325.189    260.73%
     ppoll                 19      0  1298.652     0.000    68.350   667.172    281.46%
     clock_nanosleep        1      0  1000.093     0.000  1000.093  1000.093      0.00%
     epoll_pwait           16      0   192.787     0.000    12.049   173.994    348.73%
     nanosleep              6      0    50.926     0.000     8.488    10.210     43.96%
     ...

Cc: Howard Chu <howardchu95@gmail.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
v2)
 * rebased on top of Ian's e_machine changes
 * add --bpf-summary option
 * support per-thread summary
 * add stddev calculation  (Howard)

 tools/perf/Documentation/perf-trace.txt       |   6 +
 tools/perf/Makefile.perf                      |   2 +-
 tools/perf/builtin-trace.c                    |  43 ++-
 tools/perf/util/Build                         |   1 +
 tools/perf/util/bpf-trace-summary.c           | 334 ++++++++++++++++++
 .../perf/util/bpf_skel/syscall_summary.bpf.c  | 118 +++++++
 tools/perf/util/bpf_skel/syscall_summary.h    |  25 ++
 tools/perf/util/trace.h                       |  37 ++
 8 files changed, 553 insertions(+), 13 deletions(-)
 create mode 100644 tools/perf/util/bpf-trace-summary.c
 create mode 100644 tools/perf/util/bpf_skel/syscall_summary.bpf.c
 create mode 100644 tools/perf/util/bpf_skel/syscall_summary.h
 create mode 100644 tools/perf/util/trace.h

diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
index 887dc37773d0f4d6..a8a0d8c33438fef7 100644
--- a/tools/perf/Documentation/perf-trace.txt
+++ b/tools/perf/Documentation/perf-trace.txt
@@ -251,6 +251,12 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
 	pretty-printing serves as a fallback to hand-crafted pretty printers, as the latter can
 	better pretty-print integer flags and struct pointers.
 
+--bpf-summary::
+	Collect system call statistics in BPF.  This is only for live mode and
+	works well with -s/--summary option where no argument information is
+	required.
+
+
 PAGEFAULTS
 ----------
 
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index f3cd8de15d1a2681..d7a7e0c68fc10b8b 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -1206,7 +1206,7 @@ SKELETONS += $(SKEL_OUT)/bperf_leader.skel.h $(SKEL_OUT)/bperf_follower.skel.h
 SKELETONS += $(SKEL_OUT)/bperf_cgroup.skel.h $(SKEL_OUT)/func_latency.skel.h
 SKELETONS += $(SKEL_OUT)/off_cpu.skel.h $(SKEL_OUT)/lock_contention.skel.h
 SKELETONS += $(SKEL_OUT)/kwork_trace.skel.h $(SKEL_OUT)/sample_filter.skel.h
-SKELETONS += $(SKEL_OUT)/kwork_top.skel.h
+SKELETONS += $(SKEL_OUT)/kwork_top.skel.h $(SKEL_OUT)/syscall_summary.skel.h
 SKELETONS += $(SKEL_OUT)/bench_uprobe.skel.h
 SKELETONS += $(SKEL_OUT)/augmented_raw_syscalls.skel.h
 
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 7fef59306db2891f..deeb7250e8c52354 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -55,6 +55,7 @@
 #include "util/thread_map.h"
 #include "util/stat.h"
 #include "util/tool.h"
+#include "util/trace.h"
 #include "util/util.h"
 #include "trace/beauty/beauty.h"
 #include "trace-event.h"
@@ -141,12 +142,6 @@ struct syscall_fmt {
 	bool	   hexret;
 };
 
-enum summary_mode {
-	SUMMARY__NONE = 0,
-	SUMMARY__BY_TOTAL,
-	SUMMARY__BY_THREAD,
-};
-
 struct trace {
 	struct perf_tool	tool;
 	struct {
@@ -205,7 +200,7 @@ struct trace {
 	} stats;
 	unsigned int		max_stack;
 	unsigned int		min_stack;
-	enum summary_mode	summary_mode;
+	enum trace_summary_mode	summary_mode;
 	int			raw_augmented_syscalls_args_size;
 	bool			raw_augmented_syscalls;
 	bool			fd_path_disabled;
@@ -234,6 +229,7 @@ struct trace {
 	bool			force;
 	bool			vfs_getname;
 	bool			force_btf;
+	bool			summary_bpf;
 	int			trace_pgfaults;
 	char			*perfconfig_events;
 	struct {
@@ -4356,6 +4352,13 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
 
 	trace->live = true;
 
+	if (trace->summary_bpf) {
+		if (trace_prepare_bpf_summary(trace->summary_mode) < 0)
+			goto out_delete_evlist;
+
+		goto create_maps;
+	}
+
 	if (!trace->raw_augmented_syscalls) {
 		if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
 			goto out_error_raw_syscalls;
@@ -4414,6 +4417,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
 	if (trace->cgroup)
 		evlist__set_default_cgroup(trace->evlist, trace->cgroup);
 
+create_maps:
 	err = evlist__create_maps(evlist, &trace->opts.target);
 	if (err < 0) {
 		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
@@ -4426,7 +4430,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
 		goto out_delete_evlist;
 	}
 
-	if (trace->summary_mode == SUMMARY__BY_TOTAL) {
+	if (trace->summary_mode == SUMMARY__BY_TOTAL && !trace->summary_bpf) {
 		trace->syscall_stats = alloc_syscall_stats();
 		if (trace->syscall_stats == NULL)
 			goto out_delete_evlist;
@@ -4512,9 +4516,11 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
 	if (err < 0)
 		goto out_error_apply_filters;
 
-	err = evlist__mmap(evlist, trace->opts.mmap_pages);
-	if (err < 0)
-		goto out_error_mmap;
+	if (!trace->summary_bpf) {
+		err = evlist__mmap(evlist, trace->opts.mmap_pages);
+		if (err < 0)
+			goto out_error_mmap;
+	}
 
 	if (!target__none(&trace->opts.target) && !trace->opts.target.initial_delay)
 		evlist__enable(evlist);
@@ -4527,6 +4533,9 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
 		evlist__enable(evlist);
 	}
 
+	if (trace->summary_bpf)
+		trace_start_bpf_summary();
+
 	trace->multiple_threads = perf_thread_map__pid(evlist->core.threads, 0) == -1 ||
 		perf_thread_map__nr(evlist->core.threads) > 1 ||
 		evlist__first(evlist)->core.attr.inherit;
@@ -4594,12 +4603,17 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
 
 	evlist__disable(evlist);
 
+	if (trace->summary_bpf)
+		trace_end_bpf_summary();
+
 	if (trace->sort_events)
 		ordered_events__flush(&trace->oe.data, OE_FLUSH__FINAL);
 
 	if (!err) {
 		if (trace->summary) {
-			if (trace->summary_mode == SUMMARY__BY_TOTAL)
+			if (trace->summary_bpf)
+				trace_print_bpf_summary(trace->output);
+			else if (trace->summary_mode == SUMMARY__BY_TOTAL)
 				trace__fprintf_total_summary(trace, trace->output);
 			else
 				trace__fprintf_thread_summary(trace, trace->output);
@@ -4615,6 +4629,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
 	}
 
 out_delete_evlist:
+	trace_cleanup_bpf_summary();
 	delete_syscall_stats(trace->syscall_stats);
 	trace__symbols__exit(trace);
 	evlist__free_syscall_tp_fields(evlist);
@@ -5444,6 +5459,7 @@ int cmd_trace(int argc, const char **argv)
 		     "start"),
 	OPT_BOOLEAN(0, "force-btf", &trace.force_btf, "Prefer btf_dump general pretty printer"
 		       "to customized ones"),
+	OPT_BOOLEAN(0, "bpf-summary", &trace.summary_bpf, "Summary syscall stats in BPF"),
 	OPTS_EVSWITCH(&trace.evswitch),
 	OPT_END()
 	};
@@ -5535,6 +5551,9 @@ int cmd_trace(int argc, const char **argv)
 		goto skip_augmentation;
 	}
 
+	if (trace.summary_only && trace.summary_bpf)
+		goto skip_augmentation;
+
 	trace.skel = augmented_raw_syscalls_bpf__open();
 	if (!trace.skel) {
 		pr_debug("Failed to open augmented syscalls BPF skeleton");
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index 034a6603d5a8e8b0..ba4201a6f3c69753 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -171,6 +171,7 @@ perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf_off_cpu.o
 perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf-filter.o
 perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf-filter-flex.o
 perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf-filter-bison.o
+perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf-trace-summary.o
 perf-util-$(CONFIG_PERF_BPF_SKEL) += btf.o
 
 ifeq ($(CONFIG_LIBTRACEEVENT),y)
diff --git a/tools/perf/util/bpf-trace-summary.c b/tools/perf/util/bpf-trace-summary.c
new file mode 100644
index 0000000000000000..5ae9feca244d5b22
--- /dev/null
+++ b/tools/perf/util/bpf-trace-summary.c
@@ -0,0 +1,334 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "dwarf-regs.h" /* for EM_HOST */
+#include "syscalltbl.h"
+#include "util/hashmap.h"
+#include "util/trace.h"
+#include "util/util.h"
+#include <bpf/bpf.h>
+#include <linux/time64.h>
+#include <tools/libc_compat.h> /* reallocarray */
+
+#include "bpf_skel/syscall_summary.h"
+#include "bpf_skel/syscall_summary.skel.h"
+
+
+static struct syscall_summary_bpf *skel;
+
+int trace_prepare_bpf_summary(enum trace_summary_mode mode)
+{
+	skel = syscall_summary_bpf__open();
+	if (skel == NULL) {
+		fprintf(stderr, "failed to open syscall summary bpf skeleton\n");
+		return -1;
+	}
+
+	if (mode == SUMMARY__BY_THREAD)
+		skel->rodata->aggr_mode = SYSCALL_AGGR_THREAD;
+	else
+		skel->rodata->aggr_mode = SYSCALL_AGGR_CPU;
+
+	if (syscall_summary_bpf__load(skel) < 0) {
+		fprintf(stderr, "failed to load syscall summary bpf skeleton\n");
+		return -1;
+	}
+
+	if (syscall_summary_bpf__attach(skel) < 0) {
+		fprintf(stderr, "failed to attach syscall summary bpf skeleton\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+void trace_start_bpf_summary(void)
+{
+	skel->bss->enabled = 1;
+}
+
+void trace_end_bpf_summary(void)
+{
+	skel->bss->enabled = 0;
+}
+
+struct syscall_node {
+	int syscall_nr;
+	struct syscall_stats stats;
+};
+
+static double rel_stddev(struct syscall_stats *stat)
+{
+	double variance, average;
+
+	if (stat->count < 2)
+		return 0;
+
+	average = (double)stat->total_time / stat->count;
+
+	variance = stat->squared_sum;
+	variance -= (stat->total_time * stat->total_time) / stat->count;
+	variance /= stat->count;
+
+	return 100 * sqrt(variance) / average;
+}
+
+struct syscall_data {
+	int key; /* tid if AGGR_THREAD, syscall-nr if AGGR_CPU */
+	int nr_events;
+	int nr_nodes;
+	u64 total_time;
+	struct syscall_node *nodes;
+};
+
+static int datacmp(const void *a, const void *b)
+{
+	const struct syscall_data * const *sa = a;
+	const struct syscall_data * const *sb = b;
+
+	return (*sa)->total_time > (*sb)->total_time ? -1 : 1;
+}
+
+static int nodecmp(const void *a, const void *b)
+{
+	const struct syscall_node *na = a;
+	const struct syscall_node *nb = b;
+
+	return na->stats.total_time > nb->stats.total_time ? -1 : 1;
+}
+
+static size_t sc_node_hash(long key, void *ctx __maybe_unused)
+{
+	return key;
+}
+
+static bool sc_node_equal(long key1, long key2, void *ctx __maybe_unused)
+{
+	return key1 == key2;
+}
+
+static int print_common_stats(struct syscall_data *data, FILE *fp)
+{
+	int printed = 0;
+
+	for (int i = 0; i < data->nr_nodes; i++) {
+		struct syscall_node *node = &data->nodes[i];
+		struct syscall_stats *stat = &node->stats;
+		double total = (double)(stat->total_time) / NSEC_PER_MSEC;
+		double min = (double)(stat->min_time) / NSEC_PER_MSEC;
+		double max = (double)(stat->max_time) / NSEC_PER_MSEC;
+		double avg = total / stat->count;
+		const char *name;
+
+		/* TODO: support other ABIs */
+		name = syscalltbl__name(EM_HOST, node->syscall_nr);
+		if (name)
+			printed += fprintf(fp, "   %-15s", name);
+		else
+			printed += fprintf(fp, "   syscall:%-7d", node->syscall_nr);
+
+		printed += fprintf(fp, " %8u %6u %9.3f %9.3f %9.3f %9.3f %9.2f%%\n",
+				   stat->count, stat->error, total, min, avg, max,
+				   rel_stddev(stat));
+	}
+	return printed;
+}
+
+static int update_thread_stats(struct hashmap *hash, struct syscall_key *map_key,
+			       struct syscall_stats *map_data)
+{
+	struct syscall_data *data;
+	struct syscall_node *nodes;
+
+	if (!hashmap__find(hash, map_key->cpu_or_tid, &data)) {
+		data = zalloc(sizeof(*data));
+		if (data == NULL)
+			return -ENOMEM;
+
+		data->key = map_key->cpu_or_tid;
+		if (hashmap__add(hash, data->key, data) < 0) {
+			free(data);
+			return -ENOMEM;
+		}
+	}
+
+	/* update thread total stats */
+	data->nr_events += map_data->count;
+	data->total_time += map_data->total_time;
+
+	nodes = reallocarray(data->nodes, data->nr_nodes + 1, sizeof(*nodes));
+	if (nodes == NULL)
+		return -ENOMEM;
+
+	data->nodes = nodes;
+	nodes = &data->nodes[data->nr_nodes++];
+	nodes->syscall_nr = map_key->nr;
+
+	/* each thread has an entry for each syscall, just use the stat */
+	memcpy(&nodes->stats, map_data, sizeof(*map_data));
+	return 0;
+}
+
+static int print_thread_stat(struct syscall_data *data, FILE *fp)
+{
+	int printed = 0;
+
+	qsort(data->nodes, data->nr_nodes, sizeof(*data->nodes), nodecmp);
+
+	printed += fprintf(fp, " thread (%d), ", data->key);
+	printed += fprintf(fp, "%d events\n\n", data->nr_events);
+
+	printed += fprintf(fp, "   syscall            calls  errors  total       min       avg       max       stddev\n");
+	printed += fprintf(fp, "                                     (msec)    (msec)    (msec)    (msec)        (%%)\n");
+	printed += fprintf(fp, "   --------------- --------  ------ -------- --------- --------- ---------     ------\n");
+
+	printed += print_common_stats(data, fp);
+	printed += fprintf(fp, "\n\n");
+
+	return printed;
+}
+
+static int print_thread_stats(struct syscall_data **data, int nr_data, FILE *fp)
+{
+	int printed = 0;
+
+	for (int i = 0; i < nr_data; i++)
+		printed += print_thread_stat(data[i], fp);
+
+	return printed;
+}
+
+static int update_total_stats(struct hashmap *hash, struct syscall_key *map_key,
+			      struct syscall_stats *map_data)
+{
+	struct syscall_data *data;
+	struct syscall_stats *stat;
+
+	if (!hashmap__find(hash, map_key, &data)) {
+		data = zalloc(sizeof(*data));
+		if (data == NULL)
+			return -ENOMEM;
+
+		data->nodes = zalloc(sizeof(*data->nodes));
+		if (data->nodes == NULL) {
+			free(data);
+			return -ENOMEM;
+		}
+
+		data->nr_nodes = 1;
+		data->key = map_key->nr;
+		data->nodes->syscall_nr = data->key;
+
+		if (hashmap__add(hash, data->key, data) < 0) {
+			free(data->nodes);
+			free(data);
+			return -ENOMEM;
+		}
+	}
+
+	/* update total stats for this syscall */
+	data->nr_events += map_data->count;
+	data->total_time += map_data->total_time;
+
+	/* This is sum of the same syscall from different CPUs */
+	stat = &data->nodes->stats;
+
+	stat->total_time += map_data->total_time;
+	stat->squared_sum += map_data->squared_sum;
+	stat->count += map_data->count;
+	stat->error += map_data->error;
+
+	if (stat->max_time < map_data->max_time)
+		stat->max_time = map_data->max_time;
+	if (stat->min_time > map_data->min_time)
+		stat->min_time = map_data->min_time;
+
+	return 0;
+}
+
+static int print_total_stats(struct syscall_data **data, int nr_data, FILE *fp)
+{
+	int printed = 0;
+	int nr_events = 0;
+
+	for (int i = 0; i < nr_data; i++)
+		nr_events += data[i]->nr_events;
+
+	printed += fprintf(fp, " total, %d events\n\n", nr_events);
+
+	printed += fprintf(fp, "   syscall            calls  errors  total       min       avg       max       stddev\n");
+	printed += fprintf(fp, "                                     (msec)    (msec)    (msec)    (msec)        (%%)\n");
+	printed += fprintf(fp, "   --------------- --------  ------ -------- --------- --------- ---------     ------\n");
+
+	for (int i = 0; i < nr_data; i++)
+		printed += print_common_stats(data[i], fp);
+
+	printed += fprintf(fp, "\n\n");
+	return printed;
+}
+
+int trace_print_bpf_summary(FILE *fp)
+{
+	struct bpf_map *map = skel->maps.syscall_stats_map;
+	struct syscall_key *prev_key, key;
+	struct syscall_data **data = NULL;
+	struct hashmap schash;
+	struct hashmap_entry *entry;
+	int nr_data = 0;
+	int printed = 0;
+	int i;
+	size_t bkt;
+
+	hashmap__init(&schash, sc_node_hash, sc_node_equal, /*ctx=*/NULL);
+
+	printed = fprintf(fp, "\n Summary of events:\n\n");
+
+	/* get stats from the bpf map */
+	prev_key = NULL;
+	while (!bpf_map__get_next_key(map, prev_key, &key, sizeof(key))) {
+		struct syscall_stats stat;
+
+		if (!bpf_map__lookup_elem(map, &key, sizeof(key), &stat, sizeof(stat), 0)) {
+			if (skel->rodata->aggr_mode == SYSCALL_AGGR_THREAD)
+				update_thread_stats(&schash, &key, &stat);
+			else
+				update_total_stats(&schash, &key, &stat);
+		}
+
+		prev_key = &key;
+	}
+
+	nr_data = hashmap__size(&schash);
+	data = calloc(nr_data, sizeof(*data));
+	if (data == NULL)
+		goto out;
+
+	i = 0;
+	hashmap__for_each_entry(&schash, entry, bkt)
+		data[i++] = entry->pvalue;
+
+	qsort(data, nr_data, sizeof(*data), datacmp);
+
+	if (skel->rodata->aggr_mode == SYSCALL_AGGR_THREAD)
+		printed += print_thread_stats(data, nr_data, fp);
+	else
+		printed += print_total_stats(data, nr_data, fp);
+
+	for (i = 0; i < nr_data && data; i++) {
+		free(data[i]->nodes);
+		free(data[i]);
+	}
+	free(data);
+
+out:
+	hashmap__clear(&schash);
+	return printed;
+}
+
+void trace_cleanup_bpf_summary(void)
+{
+	syscall_summary_bpf__destroy(skel);
+}
diff --git a/tools/perf/util/bpf_skel/syscall_summary.bpf.c b/tools/perf/util/bpf_skel/syscall_summary.bpf.c
new file mode 100644
index 0000000000000000..b25f53b3c1351392
--- /dev/null
+++ b/tools/perf/util/bpf_skel/syscall_summary.bpf.c
@@ -0,0 +1,118 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Trace raw_syscalls tracepoints to collect system call statistics.
+ */
+
+#include "vmlinux.h"
+#include "syscall_summary.h"
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+/* This is to calculate a delta between sys-enter and sys-exit for each thread */
+struct syscall_trace {
+	int nr; /* syscall number is only available at sys-enter */
+	int unused;
+	u64 timestamp;
+};
+
+#define MAX_ENTRIES	(128 * 1024)
+
+struct syscall_trace_map {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__type(key, int); /* tid */
+	__type(value, struct syscall_trace);
+	__uint(max_entries, MAX_ENTRIES);
+} syscall_trace_map SEC(".maps");
+
+struct syscall_stats_map {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__type(key, struct syscall_key);
+	__type(value, struct syscall_stats);
+	__uint(max_entries, MAX_ENTRIES);
+} syscall_stats_map SEC(".maps");
+
+int enabled; /* controlled from userspace */
+
+const volatile enum syscall_aggr_mode aggr_mode;
+
+static void update_stats(int cpu_or_tid, int nr, s64 duration, long ret)
+{
+	struct syscall_key key = { .cpu_or_tid = cpu_or_tid, .nr = nr, };
+	struct syscall_stats *stats;
+
+	stats = bpf_map_lookup_elem(&syscall_stats_map, &key);
+	if (stats == NULL) {
+		struct syscall_stats zero = {};
+
+		bpf_map_update_elem(&syscall_stats_map, &key, &zero, BPF_NOEXIST);
+		stats = bpf_map_lookup_elem(&syscall_stats_map, &key);
+		if (stats == NULL)
+			return;
+	}
+
+	__sync_fetch_and_add(&stats->count, 1);
+	if (ret < 0)
+		__sync_fetch_and_add(&stats->error, 1);
+
+	if (duration > 0) {
+		__sync_fetch_and_add(&stats->total_time, duration);
+		__sync_fetch_and_add(&stats->squared_sum, duration * duration);
+		if (stats->max_time < duration)
+			stats->max_time = duration;
+		if (stats->min_time > duration || stats->min_time == 0)
+			stats->min_time = duration;
+	}
+
+	return;
+}
+
+SEC("tp_btf/sys_enter")
+int sys_enter(u64 *ctx)
+{
+	int tid;
+	struct syscall_trace st;
+
+	if (!enabled)
+		return 0;
+
+	st.nr = ctx[1]; /* syscall number */
+	st.unused = 0;
+	st.timestamp = bpf_ktime_get_ns();
+
+	tid = bpf_get_current_pid_tgid();
+	bpf_map_update_elem(&syscall_trace_map, &tid, &st, BPF_ANY);
+
+	return 0;
+}
+
+SEC("tp_btf/sys_exit")
+int sys_exit(u64 *ctx)
+{
+	int tid;
+	int key;
+	long ret = ctx[1]; /* return value of the syscall */
+	struct syscall_trace *st;
+	s64 delta;
+
+	if (!enabled)
+		return 0;
+
+	tid = bpf_get_current_pid_tgid();
+	st = bpf_map_lookup_elem(&syscall_trace_map, &tid);
+	if (st == NULL)
+		return 0;
+
+	if (aggr_mode == SYSCALL_AGGR_THREAD)
+		key = tid;
+	else
+		key = bpf_get_smp_processor_id();
+
+	delta = bpf_ktime_get_ns() - st->timestamp;
+	update_stats(key, st->nr, delta, ret);
+
+	bpf_map_delete_elem(&syscall_trace_map, &tid);
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/perf/util/bpf_skel/syscall_summary.h b/tools/perf/util/bpf_skel/syscall_summary.h
new file mode 100644
index 0000000000000000..17f9ecba657088aa
--- /dev/null
+++ b/tools/perf/util/bpf_skel/syscall_summary.h
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+/* Data structures shared between BPF and tools. */
+#ifndef UTIL_BPF_SKEL_SYSCALL_SUMMARY_H
+#define UTIL_BPF_SKEL_SYSCALL_SUMMARY_H
+
+enum syscall_aggr_mode {
+	SYSCALL_AGGR_THREAD,
+	SYSCALL_AGGR_CPU,
+};
+
+struct syscall_key {
+	int cpu_or_tid;
+	int nr;
+};
+
+struct syscall_stats {
+	u64 total_time;
+	u64 squared_sum;
+	u64 max_time;
+	u64 min_time;
+	u32 count;
+	u32 error;
+};
+
+#endif /* UTIL_BPF_SKEL_SYSCALL_SUMMARY_H */
diff --git a/tools/perf/util/trace.h b/tools/perf/util/trace.h
new file mode 100644
index 0000000000000000..ef8361ed12c4edc1
--- /dev/null
+++ b/tools/perf/util/trace.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef UTIL_TRACE_H
+#define UTIL_TRACE_H
+
+#include <stdio.h>  /* for FILE */
+
+enum trace_summary_mode {
+	SUMMARY__NONE = 0,
+	SUMMARY__BY_TOTAL,
+	SUMMARY__BY_THREAD,
+};
+
+#ifdef HAVE_BPF_SKEL
+
+int trace_prepare_bpf_summary(enum trace_summary_mode mode);
+void trace_start_bpf_summary(void);
+void trace_end_bpf_summary(void);
+int trace_print_bpf_summary(FILE *fp);
+void trace_cleanup_bpf_summary(void);
+
+#else /* !HAVE_BPF_SKEL */
+
+static inline int trace_prepare_bpf_summary(enum trace_summary_mode mode __maybe_unused)
+{
+	return -1;
+}
+static inline void trace_start_bpf_summary(void) {}
+static inline void trace_end_bpf_summary(void) {}
+static inline int trace_print_bpf_summary(FILE *fp __maybe_unused)
+{
+	return 0;
+}
+static inline void trace_cleanup_bpf_summary(void) {}
+
+#endif /* HAVE_BPF_SKEL */
+
+#endif /* UTIL_TRACE_H */
-- 
2.49.0.rc1.451.g8f38331e32-goog
Re: [PATCH v2] perf trace: Implement syscall summary in BPF
Posted by Howard Chu 9 months ago
Hello again Namhyung,

As funny as it sounds, I have too much homework this week. I had to
break the review into two parts. Sorry.

1) Maybe just '--bpf-summary' instead?

First of all, is '-s --bpf-summary' ergonomic? Why not drop the
-s and just use --bpf-summary, since the option has 'summary' in its name?
Another reason being:
sudo ./perf trace -S --bpf-summary --summary-mode=total -- sleep 1
sudo ./perf trace -s --bpf-summary --summary-mode=total -- sleep 1
are the same (-S will emit no output to stdout).

2) Anomaly observed when playing around

sudo ./perf trace -s --bpf-summary --summary-mode=total -- sleep 1
this gave me 10000 events

sudo ./perf trace -as --bpf-summary --summary-mode=total -- sleep 1
while this gave me 1000 events

I guess it's something to do with the lost events?

3) Wrong stddev values
Please compare these two outputs

perf $ sudo ./perf trace -as --summary-mode=total -- sleep 1

 Summary of events:

 total, 11290 events

   syscall            calls  errors  total       min       avg       max       stddev
                                     (msec)    (msec)    (msec)    (msec)        (%)
   --------------- --------  ------ -------- --------- --------- ---------     ------
   mq_open              214     71 16073.976     0.000    75.112   250.120      9.91%
   futex               1296    195 11592.060     0.000     8.944   907.590     13.59%
   epoll_wait           479      0  4262.456     0.000     8.899   496.568     20.34%
   poll                 241      0  2545.090     0.000    10.561   607.894     33.33%
   ppoll                330      0  1713.676     0.000     5.193   410.143     26.45%
   migrate_pages         45      0  1031.915     0.000    22.931   147.830     20.70%
   clock_nanosleep        2      0  1000.106     0.000   500.053  1000.106    100.00%
   swapoff              340      0   909.827     0.000     2.676    50.117     22.76%
   pselect6               5      0   604.816     0.000   120.963   604.808    100.00%
   readlinkat            26      3   501.205     0.000    19.277   499.998     99.75%

perf $ sudo ./perf trace -as --bpf-summary --summary-mode=total -- sleep 1

 Summary of events:

 total, 880 events

   syscall            calls  errors  total       min       avg       max       stddev
                                     (msec)    (msec)    (msec)    (msec)        (%)
   --------------- --------  ------ -------- --------- --------- ---------     ------
   futex                219     46  2326.400     0.001    10.623   243.028    337.77%
   mq_open               19      8  2001.347     0.003   105.334   250.356    117.26%
   poll                   6      1  1002.512     0.002   167.085  1002.496    223.60%
   clock_nanosleep        1      0  1000.147  1000.147  1000.147  1000.147      0.00%
   swapoff               43      0   953.251     0.001    22.169    50.390    112.37%
   migrate_pages         43      0   933.727     0.004    21.715    49.149    106.68%
   ppoll                 32      0   838.035     0.002    26.189   331.222    252.10%
   epoll_pwait            5      0   499.578     0.001    99.916   499.565    199.99%
   nanosleep              1      0    10.149    10.149    10.149    10.149      0.00%
   epoll_wait            10      0     3.449     0.003     0.345     0.815     88.02%
   readlinkat            25      3     1.424     0.006     0.057     0.080     41.76%
   recvmsg               61      0     1.326     0.016     0.022     0.052     21.71%
   execve                 6      5     1.100     0.002     0.183     1.078    218.21%

I would say stddev here is a little off. The reason is:

On Mon, Mar 17, 2025 at 11:08 AM Namhyung Kim <namhyung@kernel.org> wrote:
>
> When -s/--summary option is used, it doesn't need (augmented) arguments
> of syscalls.  Let's skip the augmentation and load another small BPF
> program to collect the statistics in the kernel instead of copying the
> data to the ring-buffer to calculate the stats in userspace.  This will
> be much more light-weight than the existing approach and remove any lost
> events.
>
> Let's add a new option --bpf-summary to control this behavior.  I cannot
> make it default because there's no way to get e_machine in the BPF which
> is needed for detecting different ABIs like 32-bit compat mode.
>
> No functional changes intended except for no more LOST events. :)
>
>   $ sudo perf trace -as --bpf-summary --summary-mode=total -- sleep 1
>
>    Summary of events:
>
>    total, 2824 events
>
>      syscall            calls  errors  total       min       avg       max       stddev
>                                        (msec)    (msec)    (msec)    (msec)        (%)
>      --------------- --------  ------ -------- --------- --------- ---------     ------
>      futex                372     18  4373.773     0.000    11.757   997.715    660.42%
>      poll                 241      0  2757.963     0.000    11.444   997.758    580.34%
>      epoll_wait           161      0  2460.854     0.000    15.285   325.189    260.73%
>      ppoll                 19      0  1298.652     0.000    68.350   667.172    281.46%
>      clock_nanosleep        1      0  1000.093     0.000  1000.093  1000.093      0.00%
>      epoll_pwait           16      0   192.787     0.000    12.049   173.994    348.73%
>      nanosleep              6      0    50.926     0.000     8.488    10.210     43.96%
>      ...
>
> Cc: Howard Chu <howardchu95@gmail.com>
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> ---
> v2)
>  * rebased on top of Ian's e_machine changes
>  * add --bpf-summary option
>  * support per-thread summary
>  * add stddev calculation  (Howard)
<SNIP>
> +static double rel_stddev(struct syscall_stats *stat)
> +{
> +       double variance, average;
> +
> +       if (stat->count < 2)
> +               return 0;
> +
> +       average = (double)stat->total_time / stat->count;
> +
> +       variance = stat->squared_sum;
> +       variance -= (stat->total_time * stat->total_time) / stat->count;
> +       variance /= stat->count;

Isn't it 'variance /= stat->count - 1' because of Bessel's
correction? (Link:
https://en.wikipedia.org/wiki/Bessel%27s_correction), that is, use n
- 1 instead of n; this is what's done in stat.c.

 *       (\Sum n_i^2) - ((\Sum n_i)^2)/n
 * s^2 = -------------------------------
 *                  n - 1

and the lines down here are unfortunately incorrect
+ variance = stat->squared_sum;
+ variance -= (stat->total_time * stat->total_time) / stat->count;
+ variance /= stat->count;
+
+ return 100 * sqrt(variance) / average;

variance /= stat->count - 1; will get you the variance, but I think we
need the variance of the mean.
Link: https://en.wikipedia.org/wiki/Standard_deviation#Relationship_between_standard_deviation_and_mean

it holds that:
variance(mean) = variance / N

so you are losing a '/ stat->count'

And with all due respect, although it makes total sense in
engineering, mathematically I find variance = stat->squared_sum,
variance -= ... these accumulated calculations on the variable 'variance'
a little weird... because readers may find it difficult to determine at
which point it becomes the actual 'variance'.

with clarity in mind:

diff --git a/tools/perf/util/bpf-trace-summary.c b/tools/perf/util/bpf-trace-summary.c
index 5ae9feca244d..a435b4037082 100644
--- a/tools/perf/util/bpf-trace-summary.c
+++ b/tools/perf/util/bpf-trace-summary.c
@@ -62,18 +62,18 @@ struct syscall_node {

 static double rel_stddev(struct syscall_stats *stat)
 {
-       double variance, average;
+       double variance, average, squared_total;

        if (stat->count < 2)
                return 0;

        average = (double)stat->total_time / stat->count;

-       variance = stat->squared_sum;
-       variance -= (stat->total_time * stat->total_time) / stat->count;
-       variance /= stat->count;
+       squared_total = stat->total_time * stat->total_time;
+       variance = (stat->squared_sum - squared_total / stat->count) / (stat->count - 1);
+       stddev_mean = sqrt(variance / stat->count);

-       return 100 * sqrt(variance) / average;
+       return 100 * stddev_mean / average;
 }

btw I haven't checked the legal range for stddev_mean, so I can be wrong.
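
Putting the two corrections together, here is a minimal standalone sketch of
what I mean (the struct and the sample values are only illustrative, not taken
from the patch):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* only the fields rel_stddev() actually needs */
struct example_stats {
	uint64_t total_time;   /* sum of durations */
	uint64_t squared_sum;  /* sum of squared durations */
	uint32_t count;        /* number of calls */
};

/* relative stddev of the mean, in percent, with Bessel's correction */
static double rel_stddev_of_mean(const struct example_stats *stat)
{
	double average, variance, stddev_mean;

	if (stat->count < 2)
		return 0;

	average = (double)stat->total_time / stat->count;

	/* sample variance: (sum(x^2) - (sum(x))^2 / n) / (n - 1) */
	variance = ((double)stat->squared_sum -
		    (double)stat->total_time * stat->total_time / stat->count) /
		   (stat->count - 1);

	/* stddev of the mean: sqrt(variance / n) */
	stddev_mean = sqrt(variance / stat->count);

	return 100 * stddev_mean / average;
}

int main(void)
{
	/* samples 1, 2, 3: mean = 2, sample variance = 1, stddev(mean) = 1/sqrt(3) */
	struct example_stats s = { .total_time = 6, .squared_sum = 14, .count = 3 };

	printf("%.2f%%\n", rel_stddev_of_mean(&s)); /* prints 28.87% */
	return 0;
}
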
<SNIP>
> +static int update_total_stats(struct hashmap *hash, struct syscall_key *map_key,
> +                             struct syscall_stats *map_data)
> +{
> +       struct syscall_data *data;
> +       struct syscall_stats *stat;
> +
> +       if (!hashmap__find(hash, map_key, &data)) {
> +               data = zalloc(sizeof(*data));
> +               if (data == NULL)
> +                       return -ENOMEM;
> +
> +               data->nodes = zalloc(sizeof(*data->nodes));
> +               if (data->nodes == NULL) {
> +                       free(data);
> +                       return -ENOMEM;
> +               }
> +
> +               data->nr_nodes = 1;
> +               data->key = map_key->nr;
> +               data->nodes->syscall_nr = data->key;
Wow, aggressive. I guess you want it to behave like a single value
when it is SYSCALL_AGGR_CPU, and an array when it is
SYSCALL_AGGR_THREAD. Do you mind adding a comment about it?

so it's

(cpu, syscall_nr) -> data -> {node}
(tid, syscall_nr) -> data -> [node1, node2, node3]
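
Maybe something along these lines for the comment (just a sketch of the
wording, adjust as needed):

/*
 * The shape of data->nodes depends on the aggregation mode:
 *
 *  SYSCALL_AGGR_CPU:    data->key is the syscall nr and nodes[] has a
 *                       single entry accumulating that syscall from all
 *                       CPUs.
 *  SYSCALL_AGGR_THREAD: data->key is the tid and nodes[] gets one entry
 *                       per distinct syscall the thread made.
 */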


> +
> +               if (hashmap__add(hash, data->key, data) < 0) {
> +                       free(data->nodes);
> +                       free(data);
> +                       return -ENOMEM;
> +               }
> +       }
> +
> +       /* update total stats for this syscall */
> +       data->nr_events += map_data->count;
> +       data->total_time += map_data->total_time;
> +
> +       /* This is sum of the same syscall from different CPUs */
> +       stat = &data->nodes->stats;
> +
> +       stat->total_time += map_data->total_time;
> +       stat->squared_sum += map_data->squared_sum;
> +       stat->count += map_data->count;
> +       stat->error += map_data->error;
> +
> +       if (stat->max_time < map_data->max_time)
> +               stat->max_time = map_data->max_time;
> +       if (stat->min_time > map_data->min_time)
> +               stat->min_time = map_data->min_time;
> +
> +       return 0;
> +}
> +
> +static int print_total_stats(struct syscall_data **data, int nr_data, FILE *fp)
> +{
> +       int printed = 0;
> +       int nr_events = 0;
> +
> +       for (int i = 0; i < nr_data; i++)
> +               nr_events += data[i]->nr_events;
> +
> +       printed += fprintf(fp, " total, %d events\n\n", nr_events);
> +
> +       printed += fprintf(fp, "   syscall            calls  errors  total       min       avg       max       stddev\n");
> +       printed += fprintf(fp, "                                     (msec)    (msec)    (msec)    (msec)        (%%)\n");
> +       printed += fprintf(fp, "   --------------- --------  ------ -------- --------- --------- ---------     ------\n");
> +
> +       for (int i = 0; i < nr_data; i++)
> +               printed += print_common_stats(data[i], fp);
> +
> +       printed += fprintf(fp, "\n\n");
> +       return printed;
> +}
> +
> +int trace_print_bpf_summary(FILE *fp)
> +{
> +       struct bpf_map *map = skel->maps.syscall_stats_map;
> +       struct syscall_key *prev_key, key;
> +       struct syscall_data **data = NULL;
> +       struct hashmap schash;
> +       struct hashmap_entry *entry;
> +       int nr_data = 0;
> +       int printed = 0;
> +       int i;
> +       size_t bkt;
> +
> +       hashmap__init(&schash, sc_node_hash, sc_node_equal, /*ctx=*/NULL);
> +
> +       printed = fprintf(fp, "\n Summary of events:\n\n");
> +
> +       /* get stats from the bpf map */
> +       prev_key = NULL;
> +       while (!bpf_map__get_next_key(map, prev_key, &key, sizeof(key))) {
> +               struct syscall_stats stat;
> +
> +               if (!bpf_map__lookup_elem(map, &key, sizeof(key), &stat, sizeof(stat), 0)) {
> +                       if (skel->rodata->aggr_mode == SYSCALL_AGGR_THREAD)
> +                               update_thread_stats(&schash, &key, &stat);
> +                       else
> +                               update_total_stats(&schash, &key, &stat);
> +               }
> +
> +               prev_key = &key;
> +       }
> +
> +       nr_data = hashmap__size(&schash);
> +       data = calloc(nr_data, sizeof(*data));
> +       if (data == NULL)
> +               goto out;
> +
> +       i = 0;
> +       hashmap__for_each_entry(&schash, entry, bkt)
> +               data[i++] = entry->pvalue;
> +
> +       qsort(data, nr_data, sizeof(*data), datacmp);

Here syscall_data is sorted for both AGGR_THREAD and AGGR_CPU, meaning the
thread with the highest total syscall period will be printed first.
This is an awesome side effect, but it is not the behavior of 'sudo
./perf trace -as -- sleep 1' without the --bpf-summary option. If it
is not too trivial, maybe consider documenting this behavior? But it
may be too verbose, so I don't know.

sudo ./perf trace -as -- sleep 1

 ClientModuleMan (4956), 16 events, 0.1%

   syscall            calls  errors  total       min       avg       max       stddev
                                     (msec)    (msec)    (msec)    (msec)        (%)
   --------------- --------  ------ -------- --------- --------- ---------     ------
   futex                  8      4   750.234     0.000    93.779   250.105     48.79%


 CHTTPClientThre (15720), 16 events, 0.1%

   syscall            calls  errors  total       min       avg       max       stddev
                                     (msec)    (msec)    (msec)    (msec)        (%)
   --------------- --------  ------ -------- --------- --------- ---------     ------
   futex                  8      4  1000.425     0.000   125.053   750.317     75.59%

The order is random for the command above.

Thanks,
Howard
Re: [PATCH v2] perf trace: Implement syscall summary in BPF
Posted by Namhyung Kim 9 months ago
On Thu, Mar 20, 2025 at 07:35:01PM -0700, Howard Chu wrote:
> Hello again Namhyung,
> 
> As funny as it sounds, I have too much homework this week. I had to
> break the review into two parts. Sorry.

Thanks for taking your time!

> 
> 1) Maybe just '--bpf-summary' instead?
> 
> First of all, is '-s --bpf-summary' ergonomic? Why not drop the
> -s and just use --bpf-summary, since the option has 'summary' in its name?
> Another reason being:
> sudo ./perf trace -S --bpf-summary --summary-mode=total -- sleep 1
> sudo ./perf trace -s --bpf-summary --summary-mode=total -- sleep 1
> are the same (-S will emit no output to stdout).

Hmm.. it looks like a bug, will take a look.  Maybe --bpf-summary is
redundant, but I think we can make it the default later so that -s/-S can
use BPF without the option explicitly.  Then this option can be used
to disable it (--no-bpf-summary) for whatever reason.

> 
> 2) Anomaly observed when playing around
> 
> sudo ./perf trace -s --bpf-summary --summary-mode=total -- sleep 1
> this gave me 10000 events
> 
> sudo ./perf trace -as --bpf-summary --summary-mode=total -- sleep 1
> while this gave me 1000 events
> 
> I guess it's something to do with the lost events?

No, as you said in the previous message it didn't support process
targets yet.  I plan to disable it without -a for now.

> 
> 3) Wrong stddev values
> Please compare these two outputs
> 
> perf $ sudo ./perf trace -as --summary-mode=total -- sleep 1
> 
>  Summary of events:
> 
>  total, 11290 events
> 
>    syscall            calls  errors  total       min       avg       max       stddev
>                                      (msec)    (msec)    (msec)    (msec)        (%)
>    --------------- --------  ------ -------- --------- --------- ---------     ------
>    mq_open              214     71 16073.976     0.000    75.112   250.120      9.91%
>    futex               1296    195 11592.060     0.000     8.944   907.590     13.59%
>    epoll_wait           479      0  4262.456     0.000     8.899   496.568     20.34%
>    poll                 241      0  2545.090     0.000    10.561   607.894     33.33%
>    ppoll                330      0  1713.676     0.000     5.193   410.143     26.45%
>    migrate_pages         45      0  1031.915     0.000    22.931   147.830     20.70%
>    clock_nanosleep        2      0  1000.106     0.000   500.053  1000.106    100.00%
>    swapoff              340      0   909.827     0.000     2.676    50.117     22.76%
>    pselect6               5      0   604.816     0.000   120.963   604.808    100.00%
>    readlinkat            26      3   501.205     0.000    19.277   499.998     99.75%
> 
> perf $ sudo ./perf trace -as --bpf-summary --summary-mode=total -- sleep 1
> 
>  Summary of events:
> 
>  total, 880 events
> 
>    syscall            calls  errors  total       min       avg       max       stddev
>                                      (msec)    (msec)    (msec)    (msec)        (%)
>    --------------- --------  ------ -------- --------- --------- ---------     ------
>    futex                219     46  2326.400     0.001    10.623   243.028    337.77%
>    mq_open               19      8  2001.347     0.003   105.334   250.356    117.26%
>    poll                   6      1  1002.512     0.002   167.085  1002.496    223.60%
>    clock_nanosleep        1      0  1000.147  1000.147  1000.147  1000.147      0.00%
>    swapoff               43      0   953.251     0.001    22.169    50.390    112.37%
>    migrate_pages         43      0   933.727     0.004    21.715    49.149    106.68%
>    ppoll                 32      0   838.035     0.002    26.189   331.222    252.10%
>    epoll_pwait            5      0   499.578     0.001    99.916   499.565    199.99%
>    nanosleep              1      0    10.149    10.149    10.149    10.149      0.00%
>    epoll_wait            10      0     3.449     0.003     0.345     0.815     88.02%
>    readlinkat            25      3     1.424     0.006     0.057     0.080     41.76%
>    recvmsg               61      0     1.326     0.016     0.022     0.052     21.71%
>    execve                 6      5     1.100     0.002     0.183     1.078    218.21%
> 
> I would say stddev here is a little off. The reason is:
> 
> On Mon, Mar 17, 2025 at 11:08 AM Namhyung Kim <namhyung@kernel.org> wrote:
> >
> > When -s/--summary option is used, it doesn't need (augmented) arguments
> > of syscalls.  Let's skip the augmentation and load another small BPF
> > program to collect the statistics in the kernel instead of copying the
> > data to the ring-buffer to calculate the stats in userspace.  This will
> > be much more light-weight than the existing approach and remove any lost
> > events.
> >
> > Let's add a new option --bpf-summary to control this behavior.  I cannot
> > make it default because there's no way to get e_machine in the BPF which
> > is needed for detecting different ABIs like 32-bit compat mode.
> >
> > No functional changes intended except for no more LOST events. :)
> >
> >   $ sudo perf trace -as --bpf-summary --summary-mode=total -- sleep 1
> >
> >    Summary of events:
> >
> >    total, 2824 events
> >
> >      syscall            calls  errors  total       min       avg       max       stddev
> >                                        (msec)    (msec)    (msec)    (msec)        (%)
> >      --------------- --------  ------ -------- --------- --------- ---------     ------
> >      futex                372     18  4373.773     0.000    11.757   997.715    660.42%
> >      poll                 241      0  2757.963     0.000    11.444   997.758    580.34%
> >      epoll_wait           161      0  2460.854     0.000    15.285   325.189    260.73%
> >      ppoll                 19      0  1298.652     0.000    68.350   667.172    281.46%
> >      clock_nanosleep        1      0  1000.093     0.000  1000.093  1000.093      0.00%
> >      epoll_pwait           16      0   192.787     0.000    12.049   173.994    348.73%
> >      nanosleep              6      0    50.926     0.000     8.488    10.210     43.96%
> >      ...
> >
> > Cc: Howard Chu <howardchu95@gmail.com>
> > Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> > ---
> > v2)
> >  * rebased on top of Ian's e_machine changes
> >  * add --bpf-summary option
> >  * support per-thread summary
> >  * add stddev calculation  (Howard)
> <SNIP>
> > +static double rel_stddev(struct syscall_stats *stat)
> > +{
> > +       double variance, average;
> > +
> > +       if (stat->count < 2)
> > +               return 0;
> > +
> > +       average = (double)stat->total_time / stat->count;
> > +
> > +       variance = stat->squared_sum;
> > +       variance -= (stat->total_time * stat->total_time) / stat->count;
> > +       variance /= stat->count;
> 
> Isn't it 'variance /= stat->count - 1' because of Bessel's
> correction? (Link:
> https://en.wikipedia.org/wiki/Bessel%27s_correction), that is, use n
> - 1 instead of n; this is what's done in stat.c.
> 
>  *       (\Sum n_i^2) - ((\Sum n_i)^2)/n
>  * s^2 = -------------------------------
>  *                  n - 1
> 
> and the lines down here are unfortunately incorrect
> + variance = stat->squared_sum;
> + variance -= (stat->total_time * stat->total_time) / stat->count;
> + variance /= stat->count;
> +
> + return 100 * sqrt(variance) / average;
> 
> variance /= stat->count - 1; will get you the variance, but I think we
> need the variance of the mean.
> Link: https://en.wikipedia.org/wiki/Standard_deviation#Relationship_between_standard_deviation_and_mean
> 
> it holds that:
> variance(mean) = variance / N
> 
> so you are losing a '/ stat->count'

You're right, thanks for pointing that out.

> 
> And with all due respect, although it makes total sense in
> engineering, mathematically I find variance = stat->squared_sum,
> variance -= ... these accumulated calculations on the variable 'variance'
> a little weird... because readers may find it difficult to determine at
> which point it becomes the actual 'variance'.
> 
> with clarity in mind:
> 
> diff --git a/tools/perf/util/bpf-trace-summary.c b/tools/perf/util/bpf-trace-summary.c
> index 5ae9feca244d..a435b4037082 100644
> --- a/tools/perf/util/bpf-trace-summary.c
> +++ b/tools/perf/util/bpf-trace-summary.c
> @@ -62,18 +62,18 @@ struct syscall_node {
> 
>  static double rel_stddev(struct syscall_stats *stat)
>  {
> -       double variance, average;
> +       double variance, average, squared_total;
> 
>         if (stat->count < 2)
>                 return 0;
> 
>         average = (double)stat->total_time / stat->count;
> 
> -       variance = stat->squared_sum;
> -       variance -= (stat->total_time * stat->total_time) / stat->count;
> -       variance /= stat->count;
> +       squared_total = stat->total_time * stat->total_time;
> +       variance = (stat->squared_sum - squared_total / stat->count) / (stat->count - 1);
> +       stddev_mean = sqrt(variance / stat->count);
> 
> -       return 100 * sqrt(variance) / average;
> +       return 100 * stddev_mean / average;
>  }

Can it be like this?

diff --git a/tools/perf/util/bpf-trace-summary.c b/tools/perf/util/bpf-trace-summary.c
index a91d42447e850a59..c897fb017914960c 100644
--- a/tools/perf/util/bpf-trace-summary.c
+++ b/tools/perf/util/bpf-trace-summary.c
@@ -71,9 +71,9 @@ static double rel_stddev(struct syscall_stats *stat)
 
        variance = stat->squared_sum;
        variance -= (stat->total_time * stat->total_time) / stat->count;
-       variance /= stat->count;
+       variance /= stat->count - 1;
 
-       return 100 * sqrt(variance) / average;
+       return 100 * sqrt(variance / stat->count) / average;
 }
 
 struct syscall_data {

> 
> btw I haven't checked the legal range for stddev_mean, so I can be wrong.
> <SNIP>
> > +static int update_total_stats(struct hashmap *hash, struct syscall_key *map_key,
> > +                             struct syscall_stats *map_data)
> > +{
> > +       struct syscall_data *data;
> > +       struct syscall_stats *stat;
> > +
> > +       if (!hashmap__find(hash, map_key, &data)) {
> > +               data = zalloc(sizeof(*data));
> > +               if (data == NULL)
> > +                       return -ENOMEM;
> > +
> > +               data->nodes = zalloc(sizeof(*data->nodes));
> > +               if (data->nodes == NULL) {
> > +                       free(data);
> > +                       return -ENOMEM;
> > +               }
> > +
> > +               data->nr_nodes = 1;
> > +               data->key = map_key->nr;
> > +               data->nodes->syscall_nr = data->key;
> Wow, aggressive. I guess you want it to behave like a single value
> when it is SYSCALL_AGGR_CPU, and an array when it is
> SYSCALL_AGGR_THREAD. Do you mind adding a comment about it?
> 
> so it's
> 
> (cpu, syscall_nr) -> data -> {node}
> (tid, syscall_nr) -> data -> [node1, node2, node3]

Right, will add comments.

> 
> 
> > +
> > +               if (hashmap__add(hash, data->key, data) < 0) {
> > +                       free(data->nodes);
> > +                       free(data);
> > +                       return -ENOMEM;
> > +               }
> > +       }
> > +
> > +       /* update total stats for this syscall */
> > +       data->nr_events += map_data->count;
> > +       data->total_time += map_data->total_time;
> > +
> > +       /* This is sum of the same syscall from different CPUs */
> > +       stat = &data->nodes->stats;
> > +
> > +       stat->total_time += map_data->total_time;
> > +       stat->squared_sum += map_data->squared_sum;
> > +       stat->count += map_data->count;
> > +       stat->error += map_data->error;
> > +
> > +       if (stat->max_time < map_data->max_time)
> > +               stat->max_time = map_data->max_time;
> > +       if (stat->min_time > map_data->min_time)
> > +               stat->min_time = map_data->min_time;
> > +
> > +       return 0;
> > +}
> > +
> > +static int print_total_stats(struct syscall_data **data, int nr_data, FILE *fp)
> > +{
> > +       int printed = 0;
> > +       int nr_events = 0;
> > +
> > +       for (int i = 0; i < nr_data; i++)
> > +               nr_events += data[i]->nr_events;
> > +
> > +       printed += fprintf(fp, " total, %d events\n\n", nr_events);
> > +
> > +       printed += fprintf(fp, "   syscall            calls  errors  total       min       avg       max       stddev\n");
> > +       printed += fprintf(fp, "                                     (msec)    (msec)    (msec)    (msec)        (%%)\n");
> > +       printed += fprintf(fp, "   --------------- --------  ------ -------- --------- --------- ---------     ------\n");
> > +
> > +       for (int i = 0; i < nr_data; i++)
> > +               printed += print_common_stats(data[i], fp);
> > +
> > +       printed += fprintf(fp, "\n\n");
> > +       return printed;
> > +}
> > +
> > +int trace_print_bpf_summary(FILE *fp)
> > +{
> > +       struct bpf_map *map = skel->maps.syscall_stats_map;
> > +       struct syscall_key *prev_key, key;
> > +       struct syscall_data **data = NULL;
> > +       struct hashmap schash;
> > +       struct hashmap_entry *entry;
> > +       int nr_data = 0;
> > +       int printed = 0;
> > +       int i;
> > +       size_t bkt;
> > +
> > +       hashmap__init(&schash, sc_node_hash, sc_node_equal, /*ctx=*/NULL);
> > +
> > +       printed = fprintf(fp, "\n Summary of events:\n\n");
> > +
> > +       /* get stats from the bpf map */
> > +       prev_key = NULL;
> > +       while (!bpf_map__get_next_key(map, prev_key, &key, sizeof(key))) {
> > +               struct syscall_stats stat;
> > +
> > +               if (!bpf_map__lookup_elem(map, &key, sizeof(key), &stat, sizeof(stat), 0)) {
> > +                       if (skel->rodata->aggr_mode == SYSCALL_AGGR_THREAD)
> > +                               update_thread_stats(&schash, &key, &stat);
> > +                       else
> > +                               update_total_stats(&schash, &key, &stat);
> > +               }
> > +
> > +               prev_key = &key;
> > +       }
> > +
> > +       nr_data = hashmap__size(&schash);
> > +       data = calloc(nr_data, sizeof(*data));
> > +       if (data == NULL)
> > +               goto out;
> > +
> > +       i = 0;
> > +       hashmap__for_each_entry(&schash, entry, bkt)
> > +               data[i++] = entry->pvalue;
> > +
> > +       qsort(data, nr_data, sizeof(*data), datacmp);
> 
> Here syscall_data is sorted for both AGGR_THREAD and AGGR_CPU, meaning the
> thread with the highest total syscall period will be printed first.
> This is an awesome side effect, but it is not the behavior of 'sudo
> ./perf trace -as -- sleep 1' without the --bpf-summary option. If it
> is not too trivial, maybe consider documenting this behavior? But it
> may be too verbose, so I don't know.

The original behavior is random ordering and now it's ordered by total
time.  But we can still consider it randomly ordered.  I believe we need
to maintain strict ordering by total time and then fix the existing
code.  Once that happens I can add the documentation.

Thanks for your careful review!
Namhyung

> 
> sudo ./perf trace -as -- sleep 1
> 
>  ClientModuleMan (4956), 16 events, 0.1%
> 
>    syscall            calls  errors  total       min       avg       max       stddev
>                                      (msec)    (msec)    (msec)    (msec)        (%)
>    --------------- --------  ------ -------- --------- --------- ---------     ------
>    futex                  8      4   750.234     0.000    93.779   250.105     48.79%
>
>
>  CHTTPClientThre (15720), 16 events, 0.1%
>
>    syscall            calls  errors  total       min       avg       max       stddev
>                                      (msec)    (msec)    (msec)    (msec)        (%)
>    --------------- --------  ------ -------- --------- --------- ---------     ------
>    futex                  8      4  1000.425     0.000   125.053   750.317     75.59%
> 
> The order is random for the command above.
> 
> Thanks,
> Howard
Re: [PATCH v2] perf trace: Implement syscall summary in BPF
Posted by Howard Chu 9 months ago
Hello Namhyung,

On Thu, Mar 20, 2025 at 10:54 PM Namhyung Kim <namhyung@kernel.org> wrote:
>
> On Thu, Mar 20, 2025 at 07:35:01PM -0700, Howard Chu wrote:
> > Hello again Namhyung,
> >
> > As funny as it sounds, I have too much homework this week. I had to
> > break the review into two parts. Sorry.
>
> Thanks for taking your time!
>
> >
> > 1) Maybe just '--bpf-summary' instead?
> >
> > First of all, is '-s --bpf-summary' ergonomic? Why not drop the
> > -s and just use --bpf-summary, since the option has 'summary' in its name?
> > Another reason being:
> > sudo ./perf trace -S --bpf-summary --summary-mode=total -- sleep 1
> > sudo ./perf trace -s --bpf-summary --summary-mode=total -- sleep 1
> > are the same (-S will emit no output to stdout).
>
> Hmm.. it looks like a bug, will take a look.  Maybe --bpf-summary is
> redundant, but I think we can make it the default later so that -s/-S can
> use BPF without the option explicitly.  Then this option can be used
> to disable it (--no-bpf-summary) for whatever reason.
>
> >
> > 2) Anomaly observed when playing around
> >
> > sudo ./perf trace -s --bpf-summary --summary-mode=total -- sleep 1
> > this gave me 10000 events
> >
> > sudo ./perf trace -as --bpf-summary --summary-mode=total -- sleep 1
> > while this gave me 1000 events
> >
> > I guess it's something to do with the lost events?
>
> No, as you said in the previous message it didn't support process
> targets yet.  I plan to disable it without -a for now.

I copied the wrong command in the email; it should be:

There is a difference between

sudo ./perf trace -as --summary-mode=total -- sleep 1
sudo ./perf trace -as --bpf-summary --summary-mode=total -- sleep 1

perf $ sudo ./perf trace -as --summary-mode=total -- sleep 1
[sudo] password for howard:

 Summary of events:

 total, 15354 events

perf $ sudo ./perf trace -as --bpf-summary --summary-mode=total -- sleep 1


 Summary of events:

 total, 1319 events


Without --bpf-summary, perf trace gave more events, and it ran slower.

As for 'I plan to disable it without -a for now', I think this makes sense.

>
> >
> > 3) Wrong stddev values
> > Please compare these two outputs
> >
> > perf $ sudo ./perf trace -as --summary-mode=total -- sleep 1
> >
> >  Summary of events:
> >
> >  total, 11290 events
> >
> >    syscall            calls  errors  total       min       avg       max       stddev
> >                                      (msec)    (msec)    (msec)    (msec)        (%)
> >    --------------- --------  ------ -------- --------- --------- ---------     ------
> >    mq_open              214     71 16073.976     0.000    75.112   250.120      9.91%
> >    futex               1296    195 11592.060     0.000     8.944   907.590     13.59%
> >    epoll_wait           479      0  4262.456     0.000     8.899   496.568     20.34%
> >    poll                 241      0  2545.090     0.000    10.561   607.894     33.33%
> >    ppoll                330      0  1713.676     0.000     5.193   410.143     26.45%
> >    migrate_pages         45      0  1031.915     0.000    22.931   147.830     20.70%
> >    clock_nanosleep        2      0  1000.106     0.000   500.053  1000.106    100.00%
> >    swapoff              340      0   909.827     0.000     2.676    50.117     22.76%
> >    pselect6               5      0   604.816     0.000   120.963   604.808    100.00%
> >    readlinkat            26      3   501.205     0.000    19.277   499.998     99.75%
> >
> > perf $ sudo ./perf trace -as --bpf-summary --summary-mode=total -- sleep 1
> >
> >  Summary of events:
> >
> >  total, 880 events
> >
> >    syscall            calls  errors  total       min       avg       max       stddev
> >                                      (msec)    (msec)    (msec)    (msec)        (%)
> >    --------------- --------  ------ -------- --------- --------- ---------     ------
> >    futex                219     46  2326.400     0.001    10.623   243.028    337.77%
> >    mq_open               19      8  2001.347     0.003   105.334   250.356    117.26%
> >    poll                   6      1  1002.512     0.002   167.085  1002.496    223.60%
> >    clock_nanosleep        1      0  1000.147  1000.147  1000.147  1000.147      0.00%
> >    swapoff               43      0   953.251     0.001    22.169    50.390    112.37%
> >    migrate_pages         43      0   933.727     0.004    21.715    49.149    106.68%
> >    ppoll                 32      0   838.035     0.002    26.189   331.222    252.10%
> >    epoll_pwait            5      0   499.578     0.001    99.916   499.565    199.99%
> >    nanosleep              1      0    10.149    10.149    10.149    10.149      0.00%
> >    epoll_wait            10      0     3.449     0.003     0.345     0.815     88.02%
> >    readlinkat            25      3     1.424     0.006     0.057     0.080     41.76%
> >    recvmsg               61      0     1.326     0.016     0.022     0.052     21.71%
> >    execve                 6      5     1.100     0.002     0.183     1.078    218.21%
> >
> > I would say stddev here is a little off. The reason is:
> >
> > On Mon, Mar 17, 2025 at 11:08 AM Namhyung Kim <namhyung@kernel.org> wrote:
> > >
> > > When -s/--summary option is used, it doesn't need (augmented) arguments
> > > of syscalls.  Let's skip the augmentation and load another small BPF
> > > program to collect the statistics in the kernel instead of copying the
> > > data to the ring-buffer to calculate the stats in userspace.  This will
> > > be much more light-weight than the existing approach and remove any lost
> > > events.
> > >
> > > Let's add a new option --bpf-summary to control this behavior.  I cannot
> > > make it default because there's no way to get e_machine in the BPF which
> > > is needed for detecting different ABIs like 32-bit compat mode.
> > >
> > > No functional changes intended except for no more LOST events. :)
> > >
> > >   $ sudo perf trace -as --bpf-summary --summary-mode=total -- sleep 1
> > >
> > >    Summary of events:
> > >
> > >    total, 2824 events
> > >
> > >      syscall            calls  errors  total       min       avg       max       stddev
> > >                                        (msec)    (msec)    (msec)    (msec)        (%)
> > >      --------------- --------  ------ -------- --------- --------- ---------     ------
> > >      futex                372     18  4373.773     0.000    11.757   997.715    660.42%
> > >      poll                 241      0  2757.963     0.000    11.444   997.758    580.34%
> > >      epoll_wait           161      0  2460.854     0.000    15.285   325.189    260.73%
> > >      ppoll                 19      0  1298.652     0.000    68.350   667.172    281.46%
> > >      clock_nanosleep        1      0  1000.093     0.000  1000.093  1000.093      0.00%
> > >      epoll_pwait           16      0   192.787     0.000    12.049   173.994    348.73%
> > >      nanosleep              6      0    50.926     0.000     8.488    10.210     43.96%
> > >      ...
> > >
> > > Cc: Howard Chu <howardchu95@gmail.com>
> > > Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> > > ---
> > > v2)
> > >  * rebased on top of Ian's e_machine changes
> > >  * add --bpf-summary option
> > >  * support per-thread summary
> > >  * add stddev calculation  (Howard)
> > <SNIP>
> > > +static double rel_stddev(struct syscall_stats *stat)
> > > +{
> > > +       double variance, average;
> > > +
> > > +       if (stat->count < 2)
> > > +               return 0;
> > > +
> > > +       average = (double)stat->total_time / stat->count;
> > > +
> > > +       variance = stat->squared_sum;
> > > +       variance -= (stat->total_time * stat->total_time) / stat->count;
> > > +       variance /= stat->count;
> >
> > isn't it 'variance /= stat->count - 1' because we use Bessel's
> > correction? (Link:
> > https://en.wikipedia.org/wiki/Bessel%27s_correction), that is, to use n
> > - 1 instead of n; this is what's done in stat.c.
> >
> >  *       (\Sum n_i^2) - ((\Sum n_i)^2)/n
> >  * s^2 = -------------------------------
> >  *                  n - 1
> >
> > and the lines below are unfortunately incorrect:
> > + variance = stat->squared_sum;
> > + variance -= (stat->total_time * stat->total_time) / stat->count;
> > + variance /= stat->count;
> > +
> > + return 100 * sqrt(variance) / average;
> >
> > variance /= stat->count - 1; will get you the variance, but I think we
> > need the variance of the mean.
> > Link: https://en.wikipedia.org/wiki/Standard_deviation#Relationship_between_standard_deviation_and_mean
> >
> > it holds that:
> > variance(mean) = variance / N
> >
> > so you are losing a '/ stat->count'
>
> You're right, thanks for pointing that out.
>
> >
> > And with all due respect, although it makes total sense in
> > engineering, mathematically, I find variance = stat->squared_sum,
> > variance -= ... these accumulated calculations on variable 'variance'
> > a little weird... because readers may find it difficult to determine at
> > which point it becomes the actual 'variance'
> >
> > with clarity in mind:
> >
> > diff --git a/tools/perf/util/bpf-trace-summary.c b/tools/perf/util/bpf-trace-summary.c
> > index 5ae9feca244d..a435b4037082 100644
> > --- a/tools/perf/util/bpf-trace-summary.c
> > +++ b/tools/perf/util/bpf-trace-summary.c
> > @@ -62,18 +62,18 @@ struct syscall_node {
> >
> >  static double rel_stddev(struct syscall_stats *stat)
> >  {
> > -       double variance, average;
> > +       double variance, average, squared_total, stddev_mean;
> >
> >         if (stat->count < 2)
> >                 return 0;
> >
> >         average = (double)stat->total_time / stat->count;
> >
> > -       variance = stat->squared_sum;
> > -       variance -= (stat->total_time * stat->total_time) / stat->count;
> > -       variance /= stat->count;
> > +       squared_total = stat->total_time * stat->total_time;
> > +       variance = (stat->squared_sum - squared_total / stat->count) / (stat->count - 1);
> > +       stddev_mean = sqrt(variance / stat->count);
> >
> > -       return 100 * sqrt(variance) / average;
> > +       return 100 * stddev_mean / average;
> >  }
>
> Can it be like this?
>
> diff --git a/tools/perf/util/bpf-trace-summary.c b/tools/perf/util/bpf-trace-summary.c
> index a91d42447e850a59..c897fb017914960c 100644
> --- a/tools/perf/util/bpf-trace-summary.c
> +++ b/tools/perf/util/bpf-trace-summary.c
> @@ -71,9 +71,9 @@ static double rel_stddev(struct syscall_stats *stat)
>
>         variance = stat->squared_sum;
>         variance -= (stat->total_time * stat->total_time) / stat->count;
> -       variance /= stat->count;
> +       variance /= stat->count - 1;
>
> -       return 100 * sqrt(variance) / average;
> +       return 100 * sqrt(variance / stat->count) / average;
>  }

Of course. Then the variable 'variance' would mean 'variance of mean'.
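
Just to spell it out, with that change folded in the helper would read
roughly like this (only a sketch, reusing the fields from the patch):

static double rel_stddev(struct syscall_stats *stat)
{
	double variance, average;

	if (stat->count < 2)
		return 0;

	average = (double)stat->total_time / stat->count;

	/* sample variance with Bessel's correction (n - 1) */
	variance = stat->squared_sum;
	variance -= (stat->total_time * stat->total_time) / stat->count;
	variance /= stat->count - 1;

	/* stddev of the mean is sqrt(variance / n), printed relative to the average */
	return 100 * sqrt(variance / stat->count) / average;
}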

>
>  struct syscall_data {
>
> >
> > btw I haven't checked the legal range for stddev_mean, so I can be wrong.
> > <SNIP>
> > > +static int update_total_stats(struct hashmap *hash, struct syscall_key *map_key,
> > > +                             struct syscall_stats *map_data)
> > > +{
> > > +       struct syscall_data *data;
> > > +       struct syscall_stats *stat;
> > > +
> > > +       if (!hashmap__find(hash, map_key, &data)) {
> > > +               data = zalloc(sizeof(*data));
> > > +               if (data == NULL)
> > > +                       return -ENOMEM;
> > > +
> > > +               data->nodes = zalloc(sizeof(*data->nodes));
> > > +               if (data->nodes == NULL) {
> > > +                       free(data);
> > > +                       return -ENOMEM;
> > > +               }
> > > +
> > > +               data->nr_nodes = 1;
> > > +               data->key = map_key->nr;
> > > +               data->nodes->syscall_nr = data->key;
> > Wow, aggressive. I guess you want it to behave like a single value
> > when it is SYSCALL_AGGR_CPU, and an array when it is
> > SYSCALL_AGGR_THREAD. Do you mind adding a comment about it?
> >
> > so it's
> >
> > (cpu, syscall_nr) -> data -> {node}
> > (tid, syscall_nr) -> data -> [node1, node2, node3]
>
> Right, will add comments.
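
Something like this would work for me (just a wording suggestion, not text
taken from the patch):

/*
 * The shape of the hash depends on the aggregation mode:
 *   SYSCALL_AGGR_CPU:    the key is the syscall number and each syscall_data
 *                        keeps a single node that sums the per-CPU stats.
 *   SYSCALL_AGGR_THREAD: the key is the tid and data->nodes grows to one
 *                        entry per syscall issued by that thread.
 */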
>
> >
> >
> > > +
> > > +               if (hashmap__add(hash, data->key, data) < 0) {
> > > +                       free(data->nodes);
> > > +                       free(data);
> > > +                       return -ENOMEM;
> > > +               }
> > > +       }
> > > +
> > > +       /* update total stats for this syscall */
> > > +       data->nr_events += map_data->count;
> > > +       data->total_time += map_data->total_time;
> > > +
> > > +       /* This is sum of the same syscall from different CPUs */
> > > +       stat = &data->nodes->stats;
> > > +
> > > +       stat->total_time += map_data->total_time;
> > > +       stat->squared_sum += map_data->squared_sum;
> > > +       stat->count += map_data->count;
> > > +       stat->error += map_data->error;
> > > +
> > > +       if (stat->max_time < map_data->max_time)
> > > +               stat->max_time = map_data->max_time;
> > > +       if (stat->min_time > map_data->min_time)
> > > +               stat->min_time = map_data->min_time;
> > > +
> > > +       return 0;
> > > +}
> > > +
> > > +static int print_total_stats(struct syscall_data **data, int nr_data, FILE *fp)
> > > +{
> > > +       int printed = 0;
> > > +       int nr_events = 0;
> > > +
> > > +       for (int i = 0; i < nr_data; i++)
> > > +               nr_events += data[i]->nr_events;
> > > +
> > > +       printed += fprintf(fp, " total, %d events\n\n", nr_events);
> > > +
> > > +       printed += fprintf(fp, "   syscall            calls  errors  total       min       avg       max       stddev\n");
> > > +       printed += fprintf(fp, "                                     (msec)    (msec)    (msec)    (msec)        (%%)\n");
> > > +       printed += fprintf(fp, "   --------------- --------  ------ -------- --------- --------- ---------     ------\n");
> > > +
> > > +       for (int i = 0; i < nr_data; i++)
> > > +               printed += print_common_stats(data[i], fp);
> > > +
> > > +       printed += fprintf(fp, "\n\n");
> > > +       return printed;
> > > +}
> > > +
> > > +int trace_print_bpf_summary(FILE *fp)
> > > +{
> > > +       struct bpf_map *map = skel->maps.syscall_stats_map;
> > > +       struct syscall_key *prev_key, key;
> > > +       struct syscall_data **data = NULL;
> > > +       struct hashmap schash;
> > > +       struct hashmap_entry *entry;
> > > +       int nr_data = 0;
> > > +       int printed = 0;
> > > +       int i;
> > > +       size_t bkt;
> > > +
> > > +       hashmap__init(&schash, sc_node_hash, sc_node_equal, /*ctx=*/NULL);
> > > +
> > > +       printed = fprintf(fp, "\n Summary of events:\n\n");
> > > +
> > > +       /* get stats from the bpf map */
> > > +       prev_key = NULL;
> > > +       while (!bpf_map__get_next_key(map, prev_key, &key, sizeof(key))) {
> > > +               struct syscall_stats stat;
> > > +
> > > +               if (!bpf_map__lookup_elem(map, &key, sizeof(key), &stat, sizeof(stat), 0)) {
> > > +                       if (skel->rodata->aggr_mode == SYSCALL_AGGR_THREAD)
> > > +                               update_thread_stats(&schash, &key, &stat);
> > > +                       else
> > > +                               update_total_stats(&schash, &key, &stat);
> > > +               }
> > > +
> > > +               prev_key = &key;
> > > +       }
> > > +
> > > +       nr_data = hashmap__size(&schash);
> > > +       data = calloc(nr_data, sizeof(*data));
> > > +       if (data == NULL)
> > > +               goto out;
> > > +
> > > +       i = 0;
> > > +       hashmap__for_each_entry(&schash, entry, bkt)
> > > +               data[i++] = entry->pvalue;
> > > +
> > > +       qsort(data, nr_data, sizeof(*data), datacmp);
> >
> > Here syscall_data is sorted for AGGR_THREAD and AGGR_CPU, meaning the
> > thread with the highest total syscall period will be printed first.
> > This is an awesome side effect but it is not the behavior of 'sudo
> > ./perf trace -as -- sleep 1' without the --bpf-summary option. If it
> > is not too trivial, maybe consider documenting this behavior? But it
> > may be too verbose so Idk.
>
> The original behavior is random ordering and now it's ordered by total
> time.  But we can still consider it randomly ordered.  I believe we need
> to maintain strict ordering by total time and then fix the existing
> code.  Once that happens I can add the documentation.

ofc.

>
> Thanks for your careful review!

You are welcome.

> Namhyung

Thanks,
Howard
Re: [PATCH v2] perf trace: Implement syscall summary in BPF
Posted by Howard Chu 9 months ago
Hello Namhyung,

Can you please rebase it? I cannot apply it, getting:

perf $ git apply --reject --whitespace=fix
./v2_20250317_namhyung_perf_trace_implement_syscall_summary_in_bpf.mbx
Checking patch tools/perf/Documentation/perf-trace.txt...
Checking patch tools/perf/Makefile.perf...
Hunk #1 succeeded at 1198 (offset -8 lines).
Checking patch tools/perf/builtin-trace.c...
error: while searching for:
        bool       hexret;
};

enum summary_mode {
        SUMMARY__NONE = 0,
        SUMMARY__BY_TOTAL,
        SUMMARY__BY_THREAD,
};

struct trace {
        struct perf_tool        tool;
        struct {

error: patch failed: tools/perf/builtin-trace.c:140

Sorry.

Thanks,
Howard
Re: [PATCH v2] perf trace: Implement syscall summary in BPF
Posted by Namhyung Kim 9 months ago
Hello Howard,

On Wed, Mar 19, 2025 at 12:00:10PM -0700, Howard Chu wrote:
> Hello Namhyung,
> 
> Can you please rebase it? I cannot apply it, getting:
> 
> perf $ git apply --reject --whitespace=fix
> ./v2_20250317_namhyung_perf_trace_implement_syscall_summary_in_bpf.mbx
> Checking patch tools/perf/Documentation/perf-trace.txt...
> Checking patch tools/perf/Makefile.perf...
> Hunk #1 succeeded at 1198 (offset -8 lines).
> Checking patch tools/perf/builtin-trace.c...
> error: while searching for:
>         bool       hexret;
> };
> 
> enum summary_mode {
>         SUMMARY__NONE = 0,
>         SUMMARY__BY_TOTAL,
>         SUMMARY__BY_THREAD,
> };
> 
> struct trace {
>         struct perf_tool        tool;
>         struct {
> 
> error: patch failed: tools/perf/builtin-trace.c:140

Oops, I think I forgot to say it's on top of Ian's change.
Please try this first.  Sorry for the confusion.

https://lore.kernel.org/r/20250319050741.269828-1-irogers@google.com

Thanks,
Namhyung
Re: [PATCH v2] perf trace: Implement syscall summary in BPF
Posted by Howard Chu 9 months ago
Hi Namhyung,

On Wed, Mar 19, 2025 at 3:07 PM Namhyung Kim <namhyung@kernel.org> wrote:
>
> Hello Howard,
>
> On Wed, Mar 19, 2025 at 12:00:10PM -0700, Howard Chu wrote:
> > Hello Namhyung,
> >
> > Can you please rebase it? I cannot apply it, getting:
> >
> > perf $ git apply --reject --whitespace=fix
> > ./v2_20250317_namhyung_perf_trace_implement_syscall_summary_in_bpf.mbx
> > Checking patch tools/perf/Documentation/perf-trace.txt...
> > Checking patch tools/perf/Makefile.perf...
> > Hunk #1 succeeded at 1198 (offset -8 lines).
> > Checking patch tools/perf/builtin-trace.c...
> > error: while searching for:
> >         bool       hexret;
> > };
> >
> > enum summary_mode {
> >         SUMMARY__NONE = 0,
> >         SUMMARY__BY_TOTAL,
> >         SUMMARY__BY_THREAD,
> > };
> >
> > struct trace {
> >         struct perf_tool        tool;
> >         struct {
> >
> > error: patch failed: tools/perf/builtin-trace.c:140
>
> Oops, I think I forgot to say it's on top of Ian's change.
> Please try this first.  Sorry for the confusion.
>
> https://lore.kernel.org/r/20250319050741.269828-1-irogers@google.com

Yep, with Ian's patches it successfully applied. :)

Thanks,
Howard
>
> Thanks,
> Namhyung
>
Re: [PATCH v2] perf trace: Implement syscall summary in BPF
Posted by Howard Chu 9 months ago
Hi Namhyung,

I haven't finished the code review yet, but here are some things that I
found interesting to share:

## 1
You used sudo ./perf trace -as --bpf-summary --summary-mode=total -- sleep 1 as
an example

If I use perf trace --bpf-summary without the '-a', that is, to record
only the process/task of 'sleep 1':

sudo ./perf trace -s --bpf-summary --summary-mode=total -- sleep 1

It won't be recording just this one process. So there should be a sentence
saying that --bpf-summary only does system-wide summaries.

## 2
there is a bug in the min column, which makes min always 0

you can see it in the sample output you provided above:
     syscall            calls  errors  total       min       avg       max       stddev
                                       (msec)    (msec)    (msec)    (msec)        (%)
     --------------- --------  ------ -------- --------- --------- ---------     ------
     futex                372     18  4373.773     0.000    11.757   997.715    660.42%
     poll                 241      0  2757.963     0.000    11.444   997.758    580.34%
     epoll_wait           161      0  2460.854     0.000    15.285   325.189    260.73%
     ppoll                 19      0  1298.652     0.000    68.350   667.172    281.46%
     clock_nanosleep        1      0  1000.093     0.000  1000.093  1000.093      0.00%
     epoll_pwait           16      0   192.787     0.000    12.049   173.994    348.73%
     nanosleep              6      0    50.926     0.000     8.488    10.210     43.96%

clock_nanosleep has only 1 call so min can never be 0, it has to be
equal to the max and the mean.

This can be resolved by adding this line (same as what you did in the BPF code):

diff --git a/tools/perf/util/bpf-trace-summary.c b/tools/perf/util/bpf-trace-summary.c
index 5ae9feca244d..eb98db7d6e33 100644
--- a/tools/perf/util/bpf-trace-summary.c
+++ b/tools/perf/util/bpf-trace-summary.c
@@ -243,7 +243,7 @@ static int update_total_stats(struct hashmap *hash, struct syscall_key *map_key,
 
 	if (stat->max_time < map_data->max_time)
 		stat->max_time = map_data->max_time;
-	if (stat->min_time > map_data->min_time)
+	if (stat->min_time > map_data->min_time || !stat->min_time)
 		stat->min_time = map_data->min_time;
 
 	return 0;

(sorry for the poor formatting from the gmail browser app)

## 3
Apologies for misunderstanding how the calculation of the 'standard
deviation of mean' works. You can decide what to do with it. :) Thanks
for the explanation in the thread of the previous version.

Thanks,
Howard

On Wed, Mar 19, 2025 at 3:19 PM Howard Chu <howardchu95@gmail.com> wrote:
>
> Hi Namhyung,
>
> On Wed, Mar 19, 2025 at 3:07 PM Namhyung Kim <namhyung@kernel.org> wrote:
> >
> > Hello Howard,
> >
> > On Wed, Mar 19, 2025 at 12:00:10PM -0700, Howard Chu wrote:
> > > Hello Namhyung,
> > >
> > > Can you please rebase it? I cannot apply it, getting:
> > >
> > > perf $ git apply --reject --whitespace=fix
> > > ./v2_20250317_namhyung_perf_trace_implement_syscall_summary_in_bpf.mbx
> > > Checking patch tools/perf/Documentation/perf-trace.txt...
> > > Checking patch tools/perf/Makefile.perf...
> > > Hunk #1 succeeded at 1198 (offset -8 lines).
> > > Checking patch tools/perf/builtin-trace.c...
> > > error: while searching for:
> > >         bool       hexret;
> > > };
> > >
> > > enum summary_mode {
> > >         SUMMARY__NONE = 0,
> > >         SUMMARY__BY_TOTAL,
> > >         SUMMARY__BY_THREAD,
> > > };
> > >
> > > struct trace {
> > >         struct perf_tool        tool;
> > >         struct {
> > >
> > > error: patch failed: tools/perf/builtin-trace.c:140
> >
> > Oops, I think I forgot to say it's on top of Ian's change.
> > Please try this first.  Sorry for the confusion.
> >
> > https://lore.kernel.org/r/20250319050741.269828-1-irogers@google.com
>
> Yep, with Ian's patches it successfully applied. :)
>
> Thanks,
> Howard
> >
> > Thanks,
> > Namhyung
> >
Re: [PATCH v2] perf trace: Implement syscall summary in BPF
Posted by Namhyung Kim 9 months ago
On Wed, Mar 19, 2025 at 04:39:17PM -0700, Howard Chu wrote:
> Hi Namhyung,
> 
> I haven't finished the code review yet, but here are some things that I
> found interesting to share:

Thanks a lot for your review!

> 
> ## 1
> You used sudo ./perf trace -as --bpf-summary --summary-mode=total -- sleep 1 as
> an example
> 
> If I use perf trace --bpf-summary without the '-a', that is, to record
> only the process/task of 'sleep 1':
> 
> sudo ./perf trace -s --bpf-summary --summary-mode=total -- sleep 1
> 
> It won't be recording just this one process. So there should be a sentence
> saying that --bpf-summary only does system-wide summaries.

Hmm.. you're right.  I added per-thread summary but it still works for
system-wide only.  I'll check the target and make it fail with an error
message if it's not in system-wide mode for now.  I think we can add
support for other targets later.
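
Probably something along these lines early in cmd_trace() would do for now
(a rough sketch only; the exact message and error path may differ):

	if (trace.summary_bpf && !trace.opts.target.system_wide) {
		pr_err("--bpf-summary only supports system-wide mode (-a) for now\n");
		return -EINVAL;
	}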

> 
> ## 2
> there is a bug in the min column, which makes min always 0
> 
> you can see it in the sample output you provided above:
>      syscall            calls  errors  total       min       avg       max       stddev
>                                        (msec)    (msec)    (msec)    (msec)        (%)
>      --------------- --------  ------ -------- --------- --------- ---------     ------
>      futex                372     18  4373.773     0.000    11.757   997.715    660.42%
>      poll                 241      0  2757.963     0.000    11.444   997.758    580.34%
>      epoll_wait           161      0  2460.854     0.000    15.285   325.189    260.73%
>      ppoll                 19      0  1298.652     0.000    68.350   667.172    281.46%
>      clock_nanosleep        1      0  1000.093     0.000  1000.093  1000.093      0.00%
>      epoll_pwait           16      0   192.787     0.000    12.049   173.994    348.73%
>      nanosleep              6      0    50.926     0.000     8.488    10.210     43.96%
> 
> clock_nanosleep has only 1 call so min can never be 0, it has to be
> equal to the max and the mean.

Oops, right.

> 
> This can be resolved by adding this line (same as what you did in the BPF code):
> 
> diff --git a/tools/perf/util/bpf-trace-summary.c b/tools/perf/util/bpf-trace-summary.c
> index 5ae9feca244d..eb98db7d6e33 100644
> --- a/tools/perf/util/bpf-trace-summary.c
> +++ b/tools/perf/util/bpf-trace-summary.c
> @@ -243,7 +243,7 @@ static int update_total_stats(struct hashmap *hash, struct syscall_key *map_key,
> 
>  	if (stat->max_time < map_data->max_time)
>  		stat->max_time = map_data->max_time;
> -	if (stat->min_time > map_data->min_time)
> +	if (stat->min_time > map_data->min_time || !stat->min_time)
>  		stat->min_time = map_data->min_time;
> 
>  	return 0;
> 
> (sorry for the poor formatting from the gmail browser app)

No problem, I'll add this change.

> 
> ## 3
> Apologies for misunderstanding how the calculation of the 'standard
> deviation of mean' works. You can decide what to do with it. :) Thanks
> for the explanation in the thread of the previous version.

No, it turns out that it can be calculated easily with the squared sum.
So I've added it.
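
For the record, with t = sum of the durations, s2 = sum of the squared
durations and n = count, the variance is (s2 - t*t/n) / (n - 1), and what
gets printed is the stddev of the mean, sqrt(variance / n), relative to
the average t/n.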

Thanks,
Namhyung

> 
> Thanks,
> Howard
> 
> On Wed, Mar 19, 2025 at 3:19 PM Howard Chu <howardchu95@gmail.com> wrote:
> >
> > Hi Namhyung,
> >
> > On Wed, Mar 19, 2025 at 3:07 PM Namhyung Kim <namhyung@kernel.org> wrote:
> > >
> > > Hello Howard,
> > >
> > > On Wed, Mar 19, 2025 at 12:00:10PM -0700, Howard Chu wrote:
> > > > Hello Namhyung,
> > > >
> > > > Can you please rebase it? I cannot apply it, getting:
> > > >
> > > > perf $ git apply --reject --whitespace=fix
> > > > ./v2_20250317_namhyung_perf_trace_implement_syscall_summary_in_bpf.mbx
> > > > Checking patch tools/perf/Documentation/perf-trace.txt...
> > > > Checking patch tools/perf/Makefile.perf...
> > > > Hunk #1 succeeded at 1198 (offset -8 lines).
> > > > Checking patch tools/perf/builtin-trace.c...
> > > > error: while searching for:
> > > >         bool       hexret;
> > > > };
> > > >
> > > > enum summary_mode {
> > > >         SUMMARY__NONE = 0,
> > > >         SUMMARY__BY_TOTAL,
> > > >         SUMMARY__BY_THREAD,
> > > > };
> > > >
> > > > struct trace {
> > > >         struct perf_tool        tool;
> > > >         struct {
> > > >
> > > > error: patch failed: tools/perf/builtin-trace.c:140
> > >
> > > Oops, I think I forgot to say it's on top of Ian's change.
> > > Please try this first.  Sorry for the confusion.
> > >
> > > https://lore.kernel.org/r/20250319050741.269828-1-irogers@google.com
> >
> > Yep, with Ian's patches it successfully applied. :)
> >
> > Thanks,
> > Howard
> > >
> > > Thanks,
> > > Namhyung
> > >
Re: [PATCH v2] perf trace: Implement syscall summary in BPF
Posted by Ian Rogers 9 months ago
On Mon, Mar 17, 2025 at 11:08 AM Namhyung Kim <namhyung@kernel.org> wrote:
>
> When -s/--summary option is used, it doesn't need (augmented) arguments
> of syscalls.  Let's skip the augmentation and load another small BPF
> program to collect the statistics in the kernel instead of copying the
> data to the ring-buffer to calculate the stats in userspace.  This will
> be much more light-weight than the existing approach and remove any lost
> events.
>
> Let's add a new option --bpf-summary to control this behavior.  I cannot
> make it default because there's no way to get e_machine in the BPF which
> is needed for detecting different ABIs like 32-bit compat mode.
>
> No functional changes intended except for no more LOST events. :)
>
>   $ sudo perf trace -as --bpf-summary --summary-mode=total -- sleep 1
>
>    Summary of events:
>
>    total, 2824 events
>
>      syscall            calls  errors  total       min       avg       max       stddev
>                                        (msec)    (msec)    (msec)    (msec)        (%)
>      --------------- --------  ------ -------- --------- --------- ---------     ------
>      futex                372     18  4373.773     0.000    11.757   997.715    660.42%
>      poll                 241      0  2757.963     0.000    11.444   997.758    580.34%
>      epoll_wait           161      0  2460.854     0.000    15.285   325.189    260.73%
>      ppoll                 19      0  1298.652     0.000    68.350   667.172    281.46%
>      clock_nanosleep        1      0  1000.093     0.000  1000.093  1000.093      0.00%
>      epoll_pwait           16      0   192.787     0.000    12.049   173.994    348.73%
>      nanosleep              6      0    50.926     0.000     8.488    10.210     43.96%
>      ...
>
> Cc: Howard Chu <howardchu95@gmail.com>
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> ---
> v2)
>  * rebased on top of Ian's e_machine changes
>  * add --bpf-summary option
>  * support per-thread summary
>  * add stddev calculation  (Howard)
>
>  tools/perf/Documentation/perf-trace.txt       |   6 +
>  tools/perf/Makefile.perf                      |   2 +-
>  tools/perf/builtin-trace.c                    |  43 ++-
>  tools/perf/util/Build                         |   1 +
>  tools/perf/util/bpf-trace-summary.c           | 334 ++++++++++++++++++
>  .../perf/util/bpf_skel/syscall_summary.bpf.c  | 118 +++++++
>  tools/perf/util/bpf_skel/syscall_summary.h    |  25 ++
>  tools/perf/util/trace.h                       |  37 ++
>  8 files changed, 553 insertions(+), 13 deletions(-)
>  create mode 100644 tools/perf/util/bpf-trace-summary.c
>  create mode 100644 tools/perf/util/bpf_skel/syscall_summary.bpf.c
>  create mode 100644 tools/perf/util/bpf_skel/syscall_summary.h
>  create mode 100644 tools/perf/util/trace.h
>
> diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
> index 887dc37773d0f4d6..a8a0d8c33438fef7 100644
> --- a/tools/perf/Documentation/perf-trace.txt
> +++ b/tools/perf/Documentation/perf-trace.txt
> @@ -251,6 +251,12 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
>         pretty-printing serves as a fallback to hand-crafted pretty printers, as the latter can
>         better pretty-print integer flags and struct pointers.
>
> +--bpf-summary::
> +       Collect system call statistics in BPF.  This is only for live mode and
> +       works well with -s/--summary option where no argument information is
> +       required.
> +
> +
>  PAGEFAULTS
>  ----------
>
> diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
> index f3cd8de15d1a2681..d7a7e0c68fc10b8b 100644
> --- a/tools/perf/Makefile.perf
> +++ b/tools/perf/Makefile.perf
> @@ -1206,7 +1206,7 @@ SKELETONS += $(SKEL_OUT)/bperf_leader.skel.h $(SKEL_OUT)/bperf_follower.skel.h
>  SKELETONS += $(SKEL_OUT)/bperf_cgroup.skel.h $(SKEL_OUT)/func_latency.skel.h
>  SKELETONS += $(SKEL_OUT)/off_cpu.skel.h $(SKEL_OUT)/lock_contention.skel.h
>  SKELETONS += $(SKEL_OUT)/kwork_trace.skel.h $(SKEL_OUT)/sample_filter.skel.h
> -SKELETONS += $(SKEL_OUT)/kwork_top.skel.h
> +SKELETONS += $(SKEL_OUT)/kwork_top.skel.h $(SKEL_OUT)/syscall_summary.skel.h
>  SKELETONS += $(SKEL_OUT)/bench_uprobe.skel.h
>  SKELETONS += $(SKEL_OUT)/augmented_raw_syscalls.skel.h
>
> diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
> index 7fef59306db2891f..deeb7250e8c52354 100644
> --- a/tools/perf/builtin-trace.c
> +++ b/tools/perf/builtin-trace.c
> @@ -55,6 +55,7 @@
>  #include "util/thread_map.h"
>  #include "util/stat.h"
>  #include "util/tool.h"
> +#include "util/trace.h"
>  #include "util/util.h"
>  #include "trace/beauty/beauty.h"
>  #include "trace-event.h"
> @@ -141,12 +142,6 @@ struct syscall_fmt {
>         bool       hexret;
>  };
>
> -enum summary_mode {
> -       SUMMARY__NONE = 0,
> -       SUMMARY__BY_TOTAL,
> -       SUMMARY__BY_THREAD,
> -};
> -
>  struct trace {
>         struct perf_tool        tool;
>         struct {
> @@ -205,7 +200,7 @@ struct trace {
>         } stats;
>         unsigned int            max_stack;
>         unsigned int            min_stack;
> -       enum summary_mode       summary_mode;
> +       enum trace_summary_mode summary_mode;
>         int                     raw_augmented_syscalls_args_size;
>         bool                    raw_augmented_syscalls;
>         bool                    fd_path_disabled;
> @@ -234,6 +229,7 @@ struct trace {
>         bool                    force;
>         bool                    vfs_getname;
>         bool                    force_btf;
> +       bool                    summary_bpf;
>         int                     trace_pgfaults;
>         char                    *perfconfig_events;
>         struct {
> @@ -4356,6 +4352,13 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
>
>         trace->live = true;
>
> +       if (trace->summary_bpf) {
> +               if (trace_prepare_bpf_summary(trace->summary_mode) < 0)
> +                       goto out_delete_evlist;
> +
> +               goto create_maps;
> +       }
> +
>         if (!trace->raw_augmented_syscalls) {
>                 if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
>                         goto out_error_raw_syscalls;
> @@ -4414,6 +4417,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
>         if (trace->cgroup)
>                 evlist__set_default_cgroup(trace->evlist, trace->cgroup);
>
> +create_maps:
>         err = evlist__create_maps(evlist, &trace->opts.target);
>         if (err < 0) {
>                 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
> @@ -4426,7 +4430,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
>                 goto out_delete_evlist;
>         }
>
> -       if (trace->summary_mode == SUMMARY__BY_TOTAL) {
> +       if (trace->summary_mode == SUMMARY__BY_TOTAL && !trace->summary_bpf) {
>                 trace->syscall_stats = alloc_syscall_stats();
>                 if (trace->syscall_stats == NULL)
>                         goto out_delete_evlist;
> @@ -4512,9 +4516,11 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
>         if (err < 0)
>                 goto out_error_apply_filters;
>
> -       err = evlist__mmap(evlist, trace->opts.mmap_pages);
> -       if (err < 0)
> -               goto out_error_mmap;
> +       if (!trace->summary_bpf) {
> +               err = evlist__mmap(evlist, trace->opts.mmap_pages);
> +               if (err < 0)
> +                       goto out_error_mmap;
> +       }
>
>         if (!target__none(&trace->opts.target) && !trace->opts.target.initial_delay)
>                 evlist__enable(evlist);
> @@ -4527,6 +4533,9 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
>                 evlist__enable(evlist);
>         }
>
> +       if (trace->summary_bpf)
> +               trace_start_bpf_summary();
> +
>         trace->multiple_threads = perf_thread_map__pid(evlist->core.threads, 0) == -1 ||
>                 perf_thread_map__nr(evlist->core.threads) > 1 ||
>                 evlist__first(evlist)->core.attr.inherit;
> @@ -4594,12 +4603,17 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
>
>         evlist__disable(evlist);
>
> +       if (trace->summary_bpf)
> +               trace_end_bpf_summary();
> +
>         if (trace->sort_events)
>                 ordered_events__flush(&trace->oe.data, OE_FLUSH__FINAL);
>
>         if (!err) {
>                 if (trace->summary) {
> -                       if (trace->summary_mode == SUMMARY__BY_TOTAL)
> +                       if (trace->summary_bpf)
> +                               trace_print_bpf_summary(trace->output);
> +                       else if (trace->summary_mode == SUMMARY__BY_TOTAL)
>                                 trace__fprintf_total_summary(trace, trace->output);
>                         else
>                                 trace__fprintf_thread_summary(trace, trace->output);
> @@ -4615,6 +4629,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
>         }
>
>  out_delete_evlist:
> +       trace_cleanup_bpf_summary();
>         delete_syscall_stats(trace->syscall_stats);
>         trace__symbols__exit(trace);
>         evlist__free_syscall_tp_fields(evlist);
> @@ -5444,6 +5459,7 @@ int cmd_trace(int argc, const char **argv)
>                      "start"),
>         OPT_BOOLEAN(0, "force-btf", &trace.force_btf, "Prefer btf_dump general pretty printer"
>                        "to customized ones"),
> +       OPT_BOOLEAN(0, "bpf-summary", &trace.summary_bpf, "Summary syscall stats in BPF"),
>         OPTS_EVSWITCH(&trace.evswitch),
>         OPT_END()
>         };
> @@ -5535,6 +5551,9 @@ int cmd_trace(int argc, const char **argv)
>                 goto skip_augmentation;
>         }
>
> +       if (trace.summary_only && trace.summary_bpf)
> +               goto skip_augmentation;
> +
>         trace.skel = augmented_raw_syscalls_bpf__open();
>         if (!trace.skel) {
>                 pr_debug("Failed to open augmented syscalls BPF skeleton");
> diff --git a/tools/perf/util/Build b/tools/perf/util/Build
> index 034a6603d5a8e8b0..ba4201a6f3c69753 100644
> --- a/tools/perf/util/Build
> +++ b/tools/perf/util/Build
> @@ -171,6 +171,7 @@ perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf_off_cpu.o
>  perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf-filter.o
>  perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf-filter-flex.o
>  perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf-filter-bison.o
> +perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf-trace-summary.o
>  perf-util-$(CONFIG_PERF_BPF_SKEL) += btf.o
>
>  ifeq ($(CONFIG_LIBTRACEEVENT),y)
> diff --git a/tools/perf/util/bpf-trace-summary.c b/tools/perf/util/bpf-trace-summary.c
> new file mode 100644
> index 0000000000000000..5ae9feca244d5b22
> --- /dev/null
> +++ b/tools/perf/util/bpf-trace-summary.c
> @@ -0,0 +1,334 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#include <inttypes.h>
> +#include <math.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +
> +#include "dwarf-regs.h" /* for EM_HOST */
> +#include "syscalltbl.h"
> +#include "util/hashmap.h"
> +#include "util/trace.h"
> +#include "util/util.h"
> +#include <bpf/bpf.h>
> +#include <linux/time64.h>
> +#include <tools/libc_compat.h> /* reallocarray */
> +
> +#include "bpf_skel/syscall_summary.h"
> +#include "bpf_skel/syscall_summary.skel.h"
> +
> +
> +static struct syscall_summary_bpf *skel;
> +
> +int trace_prepare_bpf_summary(enum trace_summary_mode mode)
> +{
> +       skel = syscall_summary_bpf__open();
> +       if (skel == NULL) {
> +               fprintf(stderr, "failed to open syscall summary bpf skeleton\n");
> +               return -1;
> +       }
> +
> +       if (mode == SUMMARY__BY_THREAD)
> +               skel->rodata->aggr_mode = SYSCALL_AGGR_THREAD;
> +       else
> +               skel->rodata->aggr_mode = SYSCALL_AGGR_CPU;
> +
> +       if (syscall_summary_bpf__load(skel) < 0) {
> +               fprintf(stderr, "failed to load syscall summary bpf skeleton\n");
> +               return -1;
> +       }
> +
> +       if (syscall_summary_bpf__attach(skel) < 0) {
> +               fprintf(stderr, "failed to attach syscall summary bpf skeleton\n");
> +               return -1;
> +       }
> +
> +       return 0;
> +}
> +
> +void trace_start_bpf_summary(void)
> +{
> +       skel->bss->enabled = 1;
> +}
> +
> +void trace_end_bpf_summary(void)
> +{
> +       skel->bss->enabled = 0;
> +}
> +
> +struct syscall_node {
> +       int syscall_nr;
> +       struct syscall_stats stats;
> +};
> +
> +static double rel_stddev(struct syscall_stats *stat)
> +{
> +       double variance, average;
> +
> +       if (stat->count < 2)
> +               return 0;
> +
> +       average = (double)stat->total_time / stat->count;
> +
> +       variance = stat->squared_sum;
> +       variance -= (stat->total_time * stat->total_time) / stat->count;
> +       variance /= stat->count;
> +
> +       return 100 * sqrt(variance) / average;
> +}
> +
> +struct syscall_data {
> +       int key; /* tid if AGGR_THREAD, syscall-nr if AGGR_CPU */
> +       int nr_events;
> +       int nr_nodes;
> +       u64 total_time;
> +       struct syscall_node *nodes;
> +};
> +
> +static int datacmp(const void *a, const void *b)
> +{
> +       const struct syscall_data * const *sa = a;
> +       const struct syscall_data * const *sb = b;
> +
> +       return (*sa)->total_time > (*sb)->total_time ? -1 : 1;
> +}
> +
> +static int nodecmp(const void *a, const void *b)
> +{
> +       const struct syscall_node *na = a;
> +       const struct syscall_node *nb = b;
> +
> +       return na->stats.total_time > nb->stats.total_time ? -1 : 1;
> +}
> +
> +static size_t sc_node_hash(long key, void *ctx __maybe_unused)
> +{
> +       return key;
> +}
> +
> +static bool sc_node_equal(long key1, long key2, void *ctx __maybe_unused)
> +{
> +       return key1 == key2;
> +}
> +
> +static int print_common_stats(struct syscall_data *data, FILE *fp)
> +{
> +       int printed = 0;
> +
> +       for (int i = 0; i < data->nr_nodes; i++) {
> +               struct syscall_node *node = &data->nodes[i];
> +               struct syscall_stats *stat = &node->stats;
> +               double total = (double)(stat->total_time) / NSEC_PER_MSEC;
> +               double min = (double)(stat->min_time) / NSEC_PER_MSEC;
> +               double max = (double)(stat->max_time) / NSEC_PER_MSEC;
> +               double avg = total / stat->count;
> +               const char *name;
> +
> +               /* TODO: support other ABIs */
> +               name = syscalltbl__name(EM_HOST, node->syscall_nr);
> +               if (name)
> +                       printed += fprintf(fp, "   %-15s", name);
> +               else
> +                       printed += fprintf(fp, "   syscall:%-7d", node->syscall_nr);
> +
> +               printed += fprintf(fp, " %8u %6u %9.3f %9.3f %9.3f %9.3f %9.2f%%\n",
> +                                  stat->count, stat->error, total, min, avg, max,
> +                                  rel_stddev(stat));
> +       }
> +       return printed;
> +}
> +
> +static int update_thread_stats(struct hashmap *hash, struct syscall_key *map_key,
> +                              struct syscall_stats *map_data)
> +{
> +       struct syscall_data *data;
> +       struct syscall_node *nodes;
> +
> +       if (!hashmap__find(hash, map_key->cpu_or_tid, &data)) {
> +               data = zalloc(sizeof(*data));
> +               if (data == NULL)
> +                       return -ENOMEM;
> +
> +               data->key = map_key->cpu_or_tid;
> +               if (hashmap__add(hash, data->key, data) < 0) {
> +                       free(data);
> +                       return -ENOMEM;
> +               }
> +       }
> +
> +       /* update thread total stats */
> +       data->nr_events += map_data->count;
> +       data->total_time += map_data->total_time;
> +
> +       nodes = reallocarray(data->nodes, data->nr_nodes + 1, sizeof(*nodes));
> +       if (nodes == NULL)
> +               return -ENOMEM;
> +
> +       data->nodes = nodes;
> +       nodes = &data->nodes[data->nr_nodes++];
> +       nodes->syscall_nr = map_key->nr;
> +
> +       /* each thread has an entry for each syscall, just use the stat */
> +       memcpy(&nodes->stats, map_data, sizeof(*map_data));
> +       return 0;
> +}
> +
> +static int print_thread_stat(struct syscall_data *data, FILE *fp)
> +{
> +       int printed = 0;
> +
> +       qsort(data->nodes, data->nr_nodes, sizeof(*data->nodes), nodecmp);
> +
> +       printed += fprintf(fp, " thread (%d), ", data->key);
> +       printed += fprintf(fp, "%d events\n\n", data->nr_events);
> +
> +       printed += fprintf(fp, "   syscall            calls  errors  total       min       avg       max       stddev\n");
> +       printed += fprintf(fp, "                                     (msec)    (msec)    (msec)    (msec)        (%%)\n");
> +       printed += fprintf(fp, "   --------------- --------  ------ -------- --------- --------- ---------     ------\n");

The code lgtm and follows the code base's conventions, but it seems odd
to put very specific output like this inside of util/ rather than
builtin-trace.c. stat-display.c is similar; it just seems the boundary
between util/ and the builtin-*.c files is blurry.

Thanks,
Ian

> +
> +       printed += print_common_stats(data, fp);
> +       printed += fprintf(fp, "\n\n");
> +
> +       return printed;
> +}
> +
> +static int print_thread_stats(struct syscall_data **data, int nr_data, FILE *fp)
> +{
> +       int printed = 0;
> +
> +       for (int i = 0; i < nr_data; i++)
> +               printed += print_thread_stat(data[i], fp);
> +
> +       return printed;
> +}
> +
> +static int update_total_stats(struct hashmap *hash, struct syscall_key *map_key,
> +                             struct syscall_stats *map_data)
> +{
> +       struct syscall_data *data;
> +       struct syscall_stats *stat;
> +
> +       if (!hashmap__find(hash, map_key, &data)) {
> +               data = zalloc(sizeof(*data));
> +               if (data == NULL)
> +                       return -ENOMEM;
> +
> +               data->nodes = zalloc(sizeof(*data->nodes));
> +               if (data->nodes == NULL) {
> +                       free(data);
> +                       return -ENOMEM;
> +               }
> +
> +               data->nr_nodes = 1;
> +               data->key = map_key->nr;
> +               data->nodes->syscall_nr = data->key;
> +
> +               if (hashmap__add(hash, data->key, data) < 0) {
> +                       free(data->nodes);
> +                       free(data);
> +                       return -ENOMEM;
> +               }
> +       }
> +
> +       /* update total stats for this syscall */
> +       data->nr_events += map_data->count;
> +       data->total_time += map_data->total_time;
> +
> +       /* This is sum of the same syscall from different CPUs */
> +       stat = &data->nodes->stats;
> +
> +       stat->total_time += map_data->total_time;
> +       stat->squared_sum += map_data->squared_sum;
> +       stat->count += map_data->count;
> +       stat->error += map_data->error;
> +
> +       if (stat->max_time < map_data->max_time)
> +               stat->max_time = map_data->max_time;
> +       if (stat->min_time > map_data->min_time)
> +               stat->min_time = map_data->min_time;
> +
> +       return 0;
> +}
> +
> +static int print_total_stats(struct syscall_data **data, int nr_data, FILE *fp)
> +{
> +       int printed = 0;
> +       int nr_events = 0;
> +
> +       for (int i = 0; i < nr_data; i++)
> +               nr_events += data[i]->nr_events;
> +
> +       printed += fprintf(fp, " total, %d events\n\n", nr_events);
> +
> +       printed += fprintf(fp, "   syscall            calls  errors  total       min       avg       max       stddev\n");
> +       printed += fprintf(fp, "                                     (msec)    (msec)    (msec)    (msec)        (%%)\n");
> +       printed += fprintf(fp, "   --------------- --------  ------ -------- --------- --------- ---------     ------\n");
> +
> +       for (int i = 0; i < nr_data; i++)
> +               printed += print_common_stats(data[i], fp);
> +
> +       printed += fprintf(fp, "\n\n");
> +       return printed;
> +}
> +
> +int trace_print_bpf_summary(FILE *fp)
> +{
> +       struct bpf_map *map = skel->maps.syscall_stats_map;
> +       struct syscall_key *prev_key, key;
> +       struct syscall_data **data = NULL;
> +       struct hashmap schash;
> +       struct hashmap_entry *entry;
> +       int nr_data = 0;
> +       int printed = 0;
> +       int i;
> +       size_t bkt;
> +
> +       hashmap__init(&schash, sc_node_hash, sc_node_equal, /*ctx=*/NULL);
> +
> +       printed = fprintf(fp, "\n Summary of events:\n\n");
> +
> +       /* get stats from the bpf map */
> +       prev_key = NULL;
> +       while (!bpf_map__get_next_key(map, prev_key, &key, sizeof(key))) {
> +               struct syscall_stats stat;
> +
> +               if (!bpf_map__lookup_elem(map, &key, sizeof(key), &stat, sizeof(stat), 0)) {
> +                       if (skel->rodata->aggr_mode == SYSCALL_AGGR_THREAD)
> +                               update_thread_stats(&schash, &key, &stat);
> +                       else
> +                               update_total_stats(&schash, &key, &stat);
> +               }
> +
> +               prev_key = &key;
> +       }
> +
> +       nr_data = hashmap__size(&schash);
> +       data = calloc(nr_data, sizeof(*data));
> +       if (data == NULL)
> +               goto out;
> +
> +       i = 0;
> +       hashmap__for_each_entry(&schash, entry, bkt)
> +               data[i++] = entry->pvalue;
> +
> +       qsort(data, nr_data, sizeof(*data), datacmp);
> +
> +       if (skel->rodata->aggr_mode == SYSCALL_AGGR_THREAD)
> +               printed += print_thread_stats(data, nr_data, fp);
> +       else
> +               printed += print_total_stats(data, nr_data, fp);
> +
> +       for (i = 0; i < nr_data && data; i++) {
> +               free(data[i]->nodes);
> +               free(data[i]);
> +       }
> +       free(data);
> +
> +out:
> +       hashmap__clear(&schash);
> +       return printed;
> +}
> +
> +void trace_cleanup_bpf_summary(void)
> +{
> +       syscall_summary_bpf__destroy(skel);
> +}
> diff --git a/tools/perf/util/bpf_skel/syscall_summary.bpf.c b/tools/perf/util/bpf_skel/syscall_summary.bpf.c
> new file mode 100644
> index 0000000000000000..b25f53b3c1351392
> --- /dev/null
> +++ b/tools/perf/util/bpf_skel/syscall_summary.bpf.c
> @@ -0,0 +1,118 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Trace raw_syscalls tracepoints to collect system call statistics.
> + */
> +
> +#include "vmlinux.h"
> +#include "syscall_summary.h"
> +
> +#include <bpf/bpf_helpers.h>
> +#include <bpf/bpf_tracing.h>
> +
> +/* This is to calculate a delta between sys-enter and sys-exit for each thread */
> +struct syscall_trace {
> +       int nr; /* syscall number is only available at sys-enter */
> +       int unused;
> +       u64 timestamp;
> +};
> +
> +#define MAX_ENTRIES    (128 * 1024)
> +
> +struct syscall_trace_map {
> +       __uint(type, BPF_MAP_TYPE_HASH);
> +       __type(key, int); /* tid */
> +       __type(value, struct syscall_trace);
> +       __uint(max_entries, MAX_ENTRIES);
> +} syscall_trace_map SEC(".maps");
> +
> +struct syscall_stats_map {
> +       __uint(type, BPF_MAP_TYPE_HASH);
> +       __type(key, struct syscall_key);
> +       __type(value, struct syscall_stats);
> +       __uint(max_entries, MAX_ENTRIES);
> +} syscall_stats_map SEC(".maps");
> +
> +int enabled; /* controlled from userspace */
> +
> +const volatile enum syscall_aggr_mode aggr_mode;
> +
> +static void update_stats(int cpu_or_tid, int nr, s64 duration, long ret)
> +{
> +       struct syscall_key key = { .cpu_or_tid = cpu_or_tid, .nr = nr, };
> +       struct syscall_stats *stats;
> +
> +       stats = bpf_map_lookup_elem(&syscall_stats_map, &key);
> +       if (stats == NULL) {
> +               struct syscall_stats zero = {};
> +
> +               bpf_map_update_elem(&syscall_stats_map, &key, &zero, BPF_NOEXIST);
> +               stats = bpf_map_lookup_elem(&syscall_stats_map, &key);
> +               if (stats == NULL)
> +                       return;
> +       }
> +
> +       __sync_fetch_and_add(&stats->count, 1);
> +       if (ret < 0)
> +               __sync_fetch_and_add(&stats->error, 1);
> +
> +       if (duration > 0) {
> +               __sync_fetch_and_add(&stats->total_time, duration);
> +               __sync_fetch_and_add(&stats->squared_sum, duration * duration);
> +               if (stats->max_time < duration)
> +                       stats->max_time = duration;
> +               if (stats->min_time > duration || stats->min_time == 0)
> +                       stats->min_time = duration;
> +       }
> +
> +       return;
> +}
> +
> +SEC("tp_btf/sys_enter")
> +int sys_enter(u64 *ctx)
> +{
> +       int tid;
> +       struct syscall_trace st;
> +
> +       if (!enabled)
> +               return 0;
> +
> +       st.nr = ctx[1]; /* syscall number */
> +       st.unused = 0;
> +       st.timestamp = bpf_ktime_get_ns();
> +
> +       tid = bpf_get_current_pid_tgid();
> +       bpf_map_update_elem(&syscall_trace_map, &tid, &st, BPF_ANY);
> +
> +       return 0;
> +}
> +
> +SEC("tp_btf/sys_exit")
> +int sys_exit(u64 *ctx)
> +{
> +       int tid;
> +       int key;
> +       long ret = ctx[1]; /* return value of the syscall */
> +       struct syscall_trace *st;
> +       s64 delta;
> +
> +       if (!enabled)
> +               return 0;
> +
> +       tid = bpf_get_current_pid_tgid();
> +       st = bpf_map_lookup_elem(&syscall_trace_map, &tid);
> +       if (st == NULL)
> +               return 0;
> +
> +       if (aggr_mode == SYSCALL_AGGR_THREAD)
> +               key = tid;
> +       else
> +               key = bpf_get_smp_processor_id();
> +
> +       delta = bpf_ktime_get_ns() - st->timestamp;
> +       update_stats(key, st->nr, delta, ret);
> +
> +       bpf_map_delete_elem(&syscall_trace_map, &tid);
> +       return 0;
> +}
> +
> +char _license[] SEC("license") = "GPL";
> diff --git a/tools/perf/util/bpf_skel/syscall_summary.h b/tools/perf/util/bpf_skel/syscall_summary.h
> new file mode 100644
> index 0000000000000000..17f9ecba657088aa
> --- /dev/null
> +++ b/tools/perf/util/bpf_skel/syscall_summary.h
> @@ -0,0 +1,25 @@
> +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
> +/* Data structures shared between BPF and tools. */
> +#ifndef UTIL_BPF_SKEL_SYSCALL_SUMMARY_H
> +#define UTIL_BPF_SKEL_SYSCALL_SUMMARY_H
> +
> +enum syscall_aggr_mode {
> +       SYSCALL_AGGR_THREAD,
> +       SYSCALL_AGGR_CPU,
> +};
> +
> +struct syscall_key {
> +       int cpu_or_tid;
> +       int nr;
> +};
> +
> +struct syscall_stats {
> +       u64 total_time;
> +       u64 squared_sum;
> +       u64 max_time;
> +       u64 min_time;
> +       u32 count;
> +       u32 error;
> +};
> +
> +#endif /* UTIL_BPF_SKEL_SYSCALL_SUMMARY_H */
> diff --git a/tools/perf/util/trace.h b/tools/perf/util/trace.h
> new file mode 100644
> index 0000000000000000..ef8361ed12c4edc1
> --- /dev/null
> +++ b/tools/perf/util/trace.h
> @@ -0,0 +1,37 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef UTIL_TRACE_H
> +#define UTIL_TRACE_H
> +
> +#include <stdio.h>  /* for FILE */
> +
> +enum trace_summary_mode {
> +       SUMMARY__NONE = 0,
> +       SUMMARY__BY_TOTAL,
> +       SUMMARY__BY_THREAD,
> +};
> +
> +#ifdef HAVE_BPF_SKEL
> +
> +int trace_prepare_bpf_summary(enum trace_summary_mode mode);
> +void trace_start_bpf_summary(void);
> +void trace_end_bpf_summary(void);
> +int trace_print_bpf_summary(FILE *fp);
> +void trace_cleanup_bpf_summary(void);
> +
> +#else /* !HAVE_BPF_SKEL */
> +
> +static inline int trace_prepare_bpf_summary(enum trace_summary_mode mode __maybe_unused)
> +{
> +       return -1;
> +}
> +static inline void trace_start_bpf_summary(void) {}
> +static inline void trace_end_bpf_summary(void) {}
> +static inline int trace_print_bpf_summary(FILE *fp __maybe_unused)
> +{
> +       return 0;
> +}
> +static inline void trace_cleanup_bpf_summary(void) {}
> +
> +#endif /* HAVE_BPF_SKEL */
> +
> +#endif /* UTIL_TRACE_H */
> --
> 2.49.0.rc1.451.g8f38331e32-goog
>
Re: [PATCH v2] perf trace: Implement syscall summary in BPF
Posted by Namhyung Kim 9 months ago
On Mon, Mar 17, 2025 at 11:37:22AM -0700, Ian Rogers wrote:
> On Mon, Mar 17, 2025 at 11:08 AM Namhyung Kim <namhyung@kernel.org> wrote:
> >
> > When -s/--summary option is used, it doesn't need (augmented) arguments
> > of syscalls.  Let's skip the augmentation and load another small BPF
> > program to collect the statistics in the kernel instead of copying the
> > data to the ring-buffer to calculate the stats in userspace.  This will
> > be much more light-weight than the existing approach and remove any lost
> > events.
> >
> > Let's add a new option --bpf-summary to control this behavior.  I cannot
> > make it default because there's no way to get e_machine in the BPF which
> > is needed for detecting different ABIs like 32-bit compat mode.
> >
> > No functional changes intended except for no more LOST events. :)
> >
> >   $ sudo perf trace -as --bpf-summary --summary-mode=total -- sleep 1
> >
> >    Summary of events:
> >
> >    total, 2824 events
> >
> >      syscall            calls  errors  total       min       avg       max       stddev
> >                                        (msec)    (msec)    (msec)    (msec)        (%)
> >      --------------- --------  ------ -------- --------- --------- ---------     ------
> >      futex                372     18  4373.773     0.000    11.757   997.715    660.42%
> >      poll                 241      0  2757.963     0.000    11.444   997.758    580.34%
> >      epoll_wait           161      0  2460.854     0.000    15.285   325.189    260.73%
> >      ppoll                 19      0  1298.652     0.000    68.350   667.172    281.46%
> >      clock_nanosleep        1      0  1000.093     0.000  1000.093  1000.093      0.00%
> >      epoll_pwait           16      0   192.787     0.000    12.049   173.994    348.73%
> >      nanosleep              6      0    50.926     0.000     8.488    10.210     43.96%
> >      ...
> >
> > Cc: Howard Chu <howardchu95@gmail.com>
> > Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> > ---
> > v2)
> >  * rebased on top of Ian's e_machine changes
> >  * add --bpf-summary option
> >  * support per-thread summary
> >  * add stddev calculation  (Howard)
> >
> >  tools/perf/Documentation/perf-trace.txt       |   6 +
> >  tools/perf/Makefile.perf                      |   2 +-
> >  tools/perf/builtin-trace.c                    |  43 ++-
> >  tools/perf/util/Build                         |   1 +
> >  tools/perf/util/bpf-trace-summary.c           | 334 ++++++++++++++++++
> >  .../perf/util/bpf_skel/syscall_summary.bpf.c  | 118 +++++++
> >  tools/perf/util/bpf_skel/syscall_summary.h    |  25 ++
> >  tools/perf/util/trace.h                       |  37 ++
> >  8 files changed, 553 insertions(+), 13 deletions(-)
> >  create mode 100644 tools/perf/util/bpf-trace-summary.c
> >  create mode 100644 tools/perf/util/bpf_skel/syscall_summary.bpf.c
> >  create mode 100644 tools/perf/util/bpf_skel/syscall_summary.h
> >  create mode 100644 tools/perf/util/trace.h
> >
> > diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
> > index 887dc37773d0f4d6..a8a0d8c33438fef7 100644
> > --- a/tools/perf/Documentation/perf-trace.txt
> > +++ b/tools/perf/Documentation/perf-trace.txt
> > @@ -251,6 +251,12 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
> >         pretty-printing serves as a fallback to hand-crafted pretty printers, as the latter can
> >         better pretty-print integer flags and struct pointers.
> >
> > +--bpf-summary::
> > +       Collect system call statistics in BPF.  This is only for live mode and
> > +       works well with -s/--summary option where no argument information is
> > +       required.
> > +
> > +
> >  PAGEFAULTS
> >  ----------
> >
> > diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
> > index f3cd8de15d1a2681..d7a7e0c68fc10b8b 100644
> > --- a/tools/perf/Makefile.perf
> > +++ b/tools/perf/Makefile.perf
> > @@ -1206,7 +1206,7 @@ SKELETONS += $(SKEL_OUT)/bperf_leader.skel.h $(SKEL_OUT)/bperf_follower.skel.h
> >  SKELETONS += $(SKEL_OUT)/bperf_cgroup.skel.h $(SKEL_OUT)/func_latency.skel.h
> >  SKELETONS += $(SKEL_OUT)/off_cpu.skel.h $(SKEL_OUT)/lock_contention.skel.h
> >  SKELETONS += $(SKEL_OUT)/kwork_trace.skel.h $(SKEL_OUT)/sample_filter.skel.h
> > -SKELETONS += $(SKEL_OUT)/kwork_top.skel.h
> > +SKELETONS += $(SKEL_OUT)/kwork_top.skel.h $(SKEL_OUT)/syscall_summary.skel.h
> >  SKELETONS += $(SKEL_OUT)/bench_uprobe.skel.h
> >  SKELETONS += $(SKEL_OUT)/augmented_raw_syscalls.skel.h
> >
> > diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
> > index 7fef59306db2891f..deeb7250e8c52354 100644
> > --- a/tools/perf/builtin-trace.c
> > +++ b/tools/perf/builtin-trace.c
> > @@ -55,6 +55,7 @@
> >  #include "util/thread_map.h"
> >  #include "util/stat.h"
> >  #include "util/tool.h"
> > +#include "util/trace.h"
> >  #include "util/util.h"
> >  #include "trace/beauty/beauty.h"
> >  #include "trace-event.h"
> > @@ -141,12 +142,6 @@ struct syscall_fmt {
> >         bool       hexret;
> >  };
> >
> > -enum summary_mode {
> > -       SUMMARY__NONE = 0,
> > -       SUMMARY__BY_TOTAL,
> > -       SUMMARY__BY_THREAD,
> > -};
> > -
> >  struct trace {
> >         struct perf_tool        tool;
> >         struct {
> > @@ -205,7 +200,7 @@ struct trace {
> >         } stats;
> >         unsigned int            max_stack;
> >         unsigned int            min_stack;
> > -       enum summary_mode       summary_mode;
> > +       enum trace_summary_mode summary_mode;
> >         int                     raw_augmented_syscalls_args_size;
> >         bool                    raw_augmented_syscalls;
> >         bool                    fd_path_disabled;
> > @@ -234,6 +229,7 @@ struct trace {
> >         bool                    force;
> >         bool                    vfs_getname;
> >         bool                    force_btf;
> > +       bool                    summary_bpf;
> >         int                     trace_pgfaults;
> >         char                    *perfconfig_events;
> >         struct {
> > @@ -4356,6 +4352,13 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
> >
> >         trace->live = true;
> >
> > +       if (trace->summary_bpf) {
> > +               if (trace_prepare_bpf_summary(trace->summary_mode) < 0)
> > +                       goto out_delete_evlist;
> > +
> > +               goto create_maps;
> > +       }
> > +
> >         if (!trace->raw_augmented_syscalls) {
> >                 if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
> >                         goto out_error_raw_syscalls;
> > @@ -4414,6 +4417,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
> >         if (trace->cgroup)
> >                 evlist__set_default_cgroup(trace->evlist, trace->cgroup);
> >
> > +create_maps:
> >         err = evlist__create_maps(evlist, &trace->opts.target);
> >         if (err < 0) {
> >                 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
> > @@ -4426,7 +4430,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
> >                 goto out_delete_evlist;
> >         }
> >
> > -       if (trace->summary_mode == SUMMARY__BY_TOTAL) {
> > +       if (trace->summary_mode == SUMMARY__BY_TOTAL && !trace->summary_bpf) {
> >                 trace->syscall_stats = alloc_syscall_stats();
> >                 if (trace->syscall_stats == NULL)
> >                         goto out_delete_evlist;
> > @@ -4512,9 +4516,11 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
> >         if (err < 0)
> >                 goto out_error_apply_filters;
> >
> > -       err = evlist__mmap(evlist, trace->opts.mmap_pages);
> > -       if (err < 0)
> > -               goto out_error_mmap;
> > +       if (!trace->summary_bpf) {
> > +               err = evlist__mmap(evlist, trace->opts.mmap_pages);
> > +               if (err < 0)
> > +                       goto out_error_mmap;
> > +       }
> >
> >         if (!target__none(&trace->opts.target) && !trace->opts.target.initial_delay)
> >                 evlist__enable(evlist);
> > @@ -4527,6 +4533,9 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
> >                 evlist__enable(evlist);
> >         }
> >
> > +       if (trace->summary_bpf)
> > +               trace_start_bpf_summary();
> > +
> >         trace->multiple_threads = perf_thread_map__pid(evlist->core.threads, 0) == -1 ||
> >                 perf_thread_map__nr(evlist->core.threads) > 1 ||
> >                 evlist__first(evlist)->core.attr.inherit;
> > @@ -4594,12 +4603,17 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
> >
> >         evlist__disable(evlist);
> >
> > +       if (trace->summary_bpf)
> > +               trace_end_bpf_summary();
> > +
> >         if (trace->sort_events)
> >                 ordered_events__flush(&trace->oe.data, OE_FLUSH__FINAL);
> >
> >         if (!err) {
> >                 if (trace->summary) {
> > -                       if (trace->summary_mode == SUMMARY__BY_TOTAL)
> > +                       if (trace->summary_bpf)
> > +                               trace_print_bpf_summary(trace->output);
> > +                       else if (trace->summary_mode == SUMMARY__BY_TOTAL)
> >                                 trace__fprintf_total_summary(trace, trace->output);
> >                         else
> >                                 trace__fprintf_thread_summary(trace, trace->output);
> > @@ -4615,6 +4629,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
> >         }
> >
> >  out_delete_evlist:
> > +       trace_cleanup_bpf_summary();
> >         delete_syscall_stats(trace->syscall_stats);
> >         trace__symbols__exit(trace);
> >         evlist__free_syscall_tp_fields(evlist);
> > @@ -5444,6 +5459,7 @@ int cmd_trace(int argc, const char **argv)
> >                      "start"),
> >         OPT_BOOLEAN(0, "force-btf", &trace.force_btf, "Prefer btf_dump general pretty printer"
> >                        "to customized ones"),
> > +       OPT_BOOLEAN(0, "bpf-summary", &trace.summary_bpf, "Summary syscall stats in BPF"),
> >         OPTS_EVSWITCH(&trace.evswitch),
> >         OPT_END()
> >         };
> > @@ -5535,6 +5551,9 @@ int cmd_trace(int argc, const char **argv)
> >                 goto skip_augmentation;
> >         }
> >
> > +       if (trace.summary_only && trace.summary_bpf)
> > +               goto skip_augmentation;
> > +
> >         trace.skel = augmented_raw_syscalls_bpf__open();
> >         if (!trace.skel) {
> >                 pr_debug("Failed to open augmented syscalls BPF skeleton");
> > diff --git a/tools/perf/util/Build b/tools/perf/util/Build
> > index 034a6603d5a8e8b0..ba4201a6f3c69753 100644
> > --- a/tools/perf/util/Build
> > +++ b/tools/perf/util/Build
> > @@ -171,6 +171,7 @@ perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf_off_cpu.o
> >  perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf-filter.o
> >  perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf-filter-flex.o
> >  perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf-filter-bison.o
> > +perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf-trace-summary.o
> >  perf-util-$(CONFIG_PERF_BPF_SKEL) += btf.o
> >
> >  ifeq ($(CONFIG_LIBTRACEEVENT),y)
> > diff --git a/tools/perf/util/bpf-trace-summary.c b/tools/perf/util/bpf-trace-summary.c
> > new file mode 100644
> > index 0000000000000000..5ae9feca244d5b22
> > --- /dev/null
> > +++ b/tools/perf/util/bpf-trace-summary.c
> > @@ -0,0 +1,334 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +#include <inttypes.h>
> > +#include <math.h>
> > +#include <stdio.h>
> > +#include <stdlib.h>
> > +
> > +#include "dwarf-regs.h" /* for EM_HOST */
> > +#include "syscalltbl.h"
> > +#include "util/hashmap.h"
> > +#include "util/trace.h"
> > +#include "util/util.h"
> > +#include <bpf/bpf.h>
> > +#include <linux/time64.h>
> > +#include <tools/libc_compat.h> /* reallocarray */
> > +
> > +#include "bpf_skel/syscall_summary.h"
> > +#include "bpf_skel/syscall_summary.skel.h"
> > +
> > +
> > +static struct syscall_summary_bpf *skel;
> > +
> > +int trace_prepare_bpf_summary(enum trace_summary_mode mode)
> > +{
> > +       skel = syscall_summary_bpf__open();
> > +       if (skel == NULL) {
> > +               fprintf(stderr, "failed to open syscall summary bpf skeleton\n");
> > +               return -1;
> > +       }
> > +
> > +       if (mode == SUMMARY__BY_THREAD)
> > +               skel->rodata->aggr_mode = SYSCALL_AGGR_THREAD;
> > +       else
> > +               skel->rodata->aggr_mode = SYSCALL_AGGR_CPU;
> > +
> > +       if (syscall_summary_bpf__load(skel) < 0) {
> > +               fprintf(stderr, "failed to load syscall summary bpf skeleton\n");
> > +               return -1;
> > +       }
> > +
> > +       if (syscall_summary_bpf__attach(skel) < 0) {
> > +               fprintf(stderr, "failed to attach syscall summary bpf skeleton\n");
> > +               return -1;
> > +       }
> > +
> > +       return 0;
> > +}
> > +
> > +void trace_start_bpf_summary(void)
> > +{
> > +       skel->bss->enabled = 1;
> > +}
> > +
> > +void trace_end_bpf_summary(void)
> > +{
> > +       skel->bss->enabled = 0;
> > +}
> > +
> > +struct syscall_node {
> > +       int syscall_nr;
> > +       struct syscall_stats stats;
> > +};
> > +
> > +static double rel_stddev(struct syscall_stats *stat)
> > +{
> > +       double variance, average;
> > +
> > +       if (stat->count < 2)
> > +               return 0;
> > +
> > +       average = (double)stat->total_time / stat->count;
> > +
> > +       variance = stat->squared_sum;
> > +       variance -= (stat->total_time * stat->total_time) / stat->count;
> > +       variance /= stat->count;
> > +
> > +       return 100 * sqrt(variance) / average;
> > +}
> > +
> > +struct syscall_data {
> > +       int key; /* tid if AGGR_THREAD, syscall-nr if AGGR_CPU */
> > +       int nr_events;
> > +       int nr_nodes;
> > +       u64 total_time;
> > +       struct syscall_node *nodes;
> > +};
> > +
> > +static int datacmp(const void *a, const void *b)
> > +{
> > +       const struct syscall_data * const *sa = a;
> > +       const struct syscall_data * const *sb = b;
> > +
> > +       return (*sa)->total_time > (*sb)->total_time ? -1 : 1;
> > +}
> > +
> > +static int nodecmp(const void *a, const void *b)
> > +{
> > +       const struct syscall_node *na = a;
> > +       const struct syscall_node *nb = b;
> > +
> > +       return na->stats.total_time > nb->stats.total_time ? -1 : 1;
> > +}
> > +
> > +static size_t sc_node_hash(long key, void *ctx __maybe_unused)
> > +{
> > +       return key;
> > +}
> > +
> > +static bool sc_node_equal(long key1, long key2, void *ctx __maybe_unused)
> > +{
> > +       return key1 == key2;
> > +}
> > +
> > +static int print_common_stats(struct syscall_data *data, FILE *fp)
> > +{
> > +       int printed = 0;
> > +
> > +       for (int i = 0; i < data->nr_nodes; i++) {
> > +               struct syscall_node *node = &data->nodes[i];
> > +               struct syscall_stats *stat = &node->stats;
> > +               double total = (double)(stat->total_time) / NSEC_PER_MSEC;
> > +               double min = (double)(stat->min_time) / NSEC_PER_MSEC;
> > +               double max = (double)(stat->max_time) / NSEC_PER_MSEC;
> > +               double avg = total / stat->count;
> > +               const char *name;
> > +
> > +               /* TODO: support other ABIs */
> > +               name = syscalltbl__name(EM_HOST, node->syscall_nr);
> > +               if (name)
> > +                       printed += fprintf(fp, "   %-15s", name);
> > +               else
> > +                       printed += fprintf(fp, "   syscall:%-7d", node->syscall_nr);
> > +
> > +               printed += fprintf(fp, " %8u %6u %9.3f %9.3f %9.3f %9.3f %9.2f%%\n",
> > +                                  stat->count, stat->error, total, min, avg, max,
> > +                                  rel_stddev(stat));
> > +       }
> > +       return printed;
> > +}
> > +
> > +static int update_thread_stats(struct hashmap *hash, struct syscall_key *map_key,
> > +                              struct syscall_stats *map_data)
> > +{
> > +       struct syscall_data *data;
> > +       struct syscall_node *nodes;
> > +
> > +       if (!hashmap__find(hash, map_key->cpu_or_tid, &data)) {
> > +               data = zalloc(sizeof(*data));
> > +               if (data == NULL)
> > +                       return -ENOMEM;
> > +
> > +               data->key = map_key->cpu_or_tid;
> > +               if (hashmap__add(hash, data->key, data) < 0) {
> > +                       free(data);
> > +                       return -ENOMEM;
> > +               }
> > +       }
> > +
> > +       /* update thread total stats */
> > +       data->nr_events += map_data->count;
> > +       data->total_time += map_data->total_time;
> > +
> > +       nodes = reallocarray(data->nodes, data->nr_nodes + 1, sizeof(*nodes));
> > +       if (nodes == NULL)
> > +               return -ENOMEM;
> > +
> > +       data->nodes = nodes;
> > +       nodes = &data->nodes[data->nr_nodes++];
> > +       nodes->syscall_nr = map_key->nr;
> > +
> > +       /* each thread has an entry for each syscall, just use the stat */
> > +       memcpy(&nodes->stats, map_data, sizeof(*map_data));
> > +       return 0;
> > +}
> > +
> > +static int print_thread_stat(struct syscall_data *data, FILE *fp)
> > +{
> > +       int printed = 0;
> > +
> > +       qsort(data->nodes, data->nr_nodes, sizeof(*data->nodes), nodecmp);
> > +
> > +       printed += fprintf(fp, " thread (%d), ", data->key);
> > +       printed += fprintf(fp, "%d events\n\n", data->nr_events);
> > +
> > +       printed += fprintf(fp, "   syscall            calls  errors  total       min       avg       max       stddev\n");
> > +       printed += fprintf(fp, "                                     (msec)    (msec)    (msec)    (msec)        (%%)\n");
> > +       printed += fprintf(fp, "   --------------- --------  ------ -------- --------- --------- ---------     ------\n");
> 
> The code lgtm and follows the code base's conventions, but it seems odd
> to put very specific output code like this inside util/ rather than in
> builtin-trace.c. stat-display.c is similar; it just seems the boundary
> between util/ and the builtin-*.c files is blurry.

I see your point and totally agree.  Ideally the printing code would be
in builtin-trace.c and this file would contain only the BPF-related
parts.

But that requires more refactoring to cleanly separate out the logic
that can be shared in util/.  Currently the code is large and it's hard
to factor out some parts.  I think we need to move more code to util/.

Maybe I can work on that in the near future when I add more changes to
perf trace.  But for now I'd like to keep this change as small and
independent as possible.  Is that ok with you?

Thanks,
Namhyung