tools/perf/Documentation/perf-trace.txt | 6 + tools/perf/Makefile.perf | 2 +- tools/perf/builtin-trace.c | 43 ++- tools/perf/util/Build | 1 + tools/perf/util/bpf-trace-summary.c | 334 ++++++++++++++++++ .../perf/util/bpf_skel/syscall_summary.bpf.c | 118 +++++++ tools/perf/util/bpf_skel/syscall_summary.h | 25 ++ tools/perf/util/trace.h | 37 ++ 8 files changed, 553 insertions(+), 13 deletions(-) create mode 100644 tools/perf/util/bpf-trace-summary.c create mode 100644 tools/perf/util/bpf_skel/syscall_summary.bpf.c create mode 100644 tools/perf/util/bpf_skel/syscall_summary.h create mode 100644 tools/perf/util/trace.h
When -s/--summary option is used, it doesn't need (augmented) arguments
of syscalls. Let's skip the augmentation and load another small BPF
program to collect the statistics in the kernel instead of copying the
data to the ring-buffer to calculate the stats in userspace. This will
be much more light-weight than the existing approach and remove any lost
events.
Let's add a new option --bpf-summary to control this behavior. I cannot
make it default because there's no way to get e_machine in the BPF which
is needed for detecting different ABIs like 32-bit compat mode.
No functional changes intended except for no more LOST events. :)
$ sudo perf trace -as --bpf-summary --summary-mode=total -- sleep 1
Summary of events:
total, 2824 events
syscall calls errors total min avg max stddev
(msec) (msec) (msec) (msec) (%)
--------------- -------- ------ -------- --------- --------- --------- ------
futex 372 18 4373.773 0.000 11.757 997.715 660.42%
poll 241 0 2757.963 0.000 11.444 997.758 580.34%
epoll_wait 161 0 2460.854 0.000 15.285 325.189 260.73%
ppoll 19 0 1298.652 0.000 68.350 667.172 281.46%
clock_nanosleep 1 0 1000.093 0.000 1000.093 1000.093 0.00%
epoll_pwait 16 0 192.787 0.000 12.049 173.994 348.73%
nanosleep 6 0 50.926 0.000 8.488 10.210 43.96%
...
Cc: Howard Chu <howardchu95@gmail.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
v2)
* rebased on top of Ian's e_machine changes
* add --bpf-summary option
* support per-thread summary
* add stddev calculation (Howard)
tools/perf/Documentation/perf-trace.txt | 6 +
tools/perf/Makefile.perf | 2 +-
tools/perf/builtin-trace.c | 43 ++-
tools/perf/util/Build | 1 +
tools/perf/util/bpf-trace-summary.c | 334 ++++++++++++++++++
.../perf/util/bpf_skel/syscall_summary.bpf.c | 118 +++++++
tools/perf/util/bpf_skel/syscall_summary.h | 25 ++
tools/perf/util/trace.h | 37 ++
8 files changed, 553 insertions(+), 13 deletions(-)
create mode 100644 tools/perf/util/bpf-trace-summary.c
create mode 100644 tools/perf/util/bpf_skel/syscall_summary.bpf.c
create mode 100644 tools/perf/util/bpf_skel/syscall_summary.h
create mode 100644 tools/perf/util/trace.h
diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
index 887dc37773d0f4d6..a8a0d8c33438fef7 100644
--- a/tools/perf/Documentation/perf-trace.txt
+++ b/tools/perf/Documentation/perf-trace.txt
@@ -251,6 +251,12 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
pretty-printing serves as a fallback to hand-crafted pretty printers, as the latter can
better pretty-print integer flags and struct pointers.
+--bpf-summary::
+ Collect system call statistics in BPF. This is only for live mode and
+ works well with -s/--summary option where no argument information is
+ required.
+
+
PAGEFAULTS
----------
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index f3cd8de15d1a2681..d7a7e0c68fc10b8b 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -1206,7 +1206,7 @@ SKELETONS += $(SKEL_OUT)/bperf_leader.skel.h $(SKEL_OUT)/bperf_follower.skel.h
SKELETONS += $(SKEL_OUT)/bperf_cgroup.skel.h $(SKEL_OUT)/func_latency.skel.h
SKELETONS += $(SKEL_OUT)/off_cpu.skel.h $(SKEL_OUT)/lock_contention.skel.h
SKELETONS += $(SKEL_OUT)/kwork_trace.skel.h $(SKEL_OUT)/sample_filter.skel.h
-SKELETONS += $(SKEL_OUT)/kwork_top.skel.h
+SKELETONS += $(SKEL_OUT)/kwork_top.skel.h $(SKEL_OUT)/syscall_summary.skel.h
SKELETONS += $(SKEL_OUT)/bench_uprobe.skel.h
SKELETONS += $(SKEL_OUT)/augmented_raw_syscalls.skel.h
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 7fef59306db2891f..deeb7250e8c52354 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -55,6 +55,7 @@
#include "util/thread_map.h"
#include "util/stat.h"
#include "util/tool.h"
+#include "util/trace.h"
#include "util/util.h"
#include "trace/beauty/beauty.h"
#include "trace-event.h"
@@ -141,12 +142,6 @@ struct syscall_fmt {
bool hexret;
};
-enum summary_mode {
- SUMMARY__NONE = 0,
- SUMMARY__BY_TOTAL,
- SUMMARY__BY_THREAD,
-};
-
struct trace {
struct perf_tool tool;
struct {
@@ -205,7 +200,7 @@ struct trace {
} stats;
unsigned int max_stack;
unsigned int min_stack;
- enum summary_mode summary_mode;
+ enum trace_summary_mode summary_mode;
int raw_augmented_syscalls_args_size;
bool raw_augmented_syscalls;
bool fd_path_disabled;
@@ -234,6 +229,7 @@ struct trace {
bool force;
bool vfs_getname;
bool force_btf;
+ bool summary_bpf;
int trace_pgfaults;
char *perfconfig_events;
struct {
@@ -4356,6 +4352,13 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
trace->live = true;
+ if (trace->summary_bpf) {
+ if (trace_prepare_bpf_summary(trace->summary_mode) < 0)
+ goto out_delete_evlist;
+
+ goto create_maps;
+ }
+
if (!trace->raw_augmented_syscalls) {
if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
goto out_error_raw_syscalls;
@@ -4414,6 +4417,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
if (trace->cgroup)
evlist__set_default_cgroup(trace->evlist, trace->cgroup);
+create_maps:
err = evlist__create_maps(evlist, &trace->opts.target);
if (err < 0) {
fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
@@ -4426,7 +4430,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
goto out_delete_evlist;
}
- if (trace->summary_mode == SUMMARY__BY_TOTAL) {
+ if (trace->summary_mode == SUMMARY__BY_TOTAL && !trace->summary_bpf) {
trace->syscall_stats = alloc_syscall_stats();
if (trace->syscall_stats == NULL)
goto out_delete_evlist;
@@ -4512,9 +4516,11 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
if (err < 0)
goto out_error_apply_filters;
- err = evlist__mmap(evlist, trace->opts.mmap_pages);
- if (err < 0)
- goto out_error_mmap;
+ if (!trace->summary_bpf) {
+ err = evlist__mmap(evlist, trace->opts.mmap_pages);
+ if (err < 0)
+ goto out_error_mmap;
+ }
if (!target__none(&trace->opts.target) && !trace->opts.target.initial_delay)
evlist__enable(evlist);
@@ -4527,6 +4533,9 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
evlist__enable(evlist);
}
+ if (trace->summary_bpf)
+ trace_start_bpf_summary();
+
trace->multiple_threads = perf_thread_map__pid(evlist->core.threads, 0) == -1 ||
perf_thread_map__nr(evlist->core.threads) > 1 ||
evlist__first(evlist)->core.attr.inherit;
@@ -4594,12 +4603,17 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
evlist__disable(evlist);
+ if (trace->summary_bpf)
+ trace_end_bpf_summary();
+
if (trace->sort_events)
ordered_events__flush(&trace->oe.data, OE_FLUSH__FINAL);
if (!err) {
if (trace->summary) {
- if (trace->summary_mode == SUMMARY__BY_TOTAL)
+ if (trace->summary_bpf)
+ trace_print_bpf_summary(trace->output);
+ else if (trace->summary_mode == SUMMARY__BY_TOTAL)
trace__fprintf_total_summary(trace, trace->output);
else
trace__fprintf_thread_summary(trace, trace->output);
@@ -4615,6 +4629,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
}
out_delete_evlist:
+ trace_cleanup_bpf_summary();
delete_syscall_stats(trace->syscall_stats);
trace__symbols__exit(trace);
evlist__free_syscall_tp_fields(evlist);
@@ -5444,6 +5459,7 @@ int cmd_trace(int argc, const char **argv)
"start"),
OPT_BOOLEAN(0, "force-btf", &trace.force_btf, "Prefer btf_dump general pretty printer"
"to customized ones"),
+ OPT_BOOLEAN(0, "bpf-summary", &trace.summary_bpf, "Summary syscall stats in BPF"),
OPTS_EVSWITCH(&trace.evswitch),
OPT_END()
};
@@ -5535,6 +5551,9 @@ int cmd_trace(int argc, const char **argv)
goto skip_augmentation;
}
+ if (trace.summary_only && trace.summary_bpf)
+ goto skip_augmentation;
+
trace.skel = augmented_raw_syscalls_bpf__open();
if (!trace.skel) {
pr_debug("Failed to open augmented syscalls BPF skeleton");
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index 034a6603d5a8e8b0..ba4201a6f3c69753 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -171,6 +171,7 @@ perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf_off_cpu.o
perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf-filter.o
perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf-filter-flex.o
perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf-filter-bison.o
+perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf-trace-summary.o
perf-util-$(CONFIG_PERF_BPF_SKEL) += btf.o
ifeq ($(CONFIG_LIBTRACEEVENT),y)
diff --git a/tools/perf/util/bpf-trace-summary.c b/tools/perf/util/bpf-trace-summary.c
new file mode 100644
index 0000000000000000..5ae9feca244d5b22
--- /dev/null
+++ b/tools/perf/util/bpf-trace-summary.c
@@ -0,0 +1,334 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "dwarf-regs.h" /* for EM_HOST */
+#include "syscalltbl.h"
+#include "util/hashmap.h"
+#include "util/trace.h"
+#include "util/util.h"
+#include <bpf/bpf.h>
+#include <linux/time64.h>
+#include <tools/libc_compat.h> /* reallocarray */
+
+#include "bpf_skel/syscall_summary.h"
+#include "bpf_skel/syscall_summary.skel.h"
+
+
+static struct syscall_summary_bpf *skel;
+
+int trace_prepare_bpf_summary(enum trace_summary_mode mode)
+{
+ skel = syscall_summary_bpf__open();
+ if (skel == NULL) {
+ fprintf(stderr, "failed to open syscall summary bpf skeleton\n");
+ return -1;
+ }
+
+ if (mode == SUMMARY__BY_THREAD)
+ skel->rodata->aggr_mode = SYSCALL_AGGR_THREAD;
+ else
+ skel->rodata->aggr_mode = SYSCALL_AGGR_CPU;
+
+ if (syscall_summary_bpf__load(skel) < 0) {
+ fprintf(stderr, "failed to load syscall summary bpf skeleton\n");
+ return -1;
+ }
+
+ if (syscall_summary_bpf__attach(skel) < 0) {
+ fprintf(stderr, "failed to attach syscall summary bpf skeleton\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+void trace_start_bpf_summary(void)
+{
+ skel->bss->enabled = 1;
+}
+
+void trace_end_bpf_summary(void)
+{
+ skel->bss->enabled = 0;
+}
+
+struct syscall_node {
+ int syscall_nr;
+ struct syscall_stats stats;
+};
+
+static double rel_stddev(struct syscall_stats *stat)
+{
+ double variance, average;
+
+ if (stat->count < 2)
+ return 0;
+
+ average = (double)stat->total_time / stat->count;
+
+ variance = stat->squared_sum;
+ variance -= (stat->total_time * stat->total_time) / stat->count;
+ variance /= stat->count;
+
+ return 100 * sqrt(variance) / average;
+}
+
+struct syscall_data {
+ int key; /* tid if AGGR_THREAD, syscall-nr if AGGR_CPU */
+ int nr_events;
+ int nr_nodes;
+ u64 total_time;
+ struct syscall_node *nodes;
+};
+
+static int datacmp(const void *a, const void *b)
+{
+ const struct syscall_data * const *sa = a;
+ const struct syscall_data * const *sb = b;
+
+ return (*sa)->total_time > (*sb)->total_time ? -1 : 1;
+}
+
+static int nodecmp(const void *a, const void *b)
+{
+ const struct syscall_node *na = a;
+ const struct syscall_node *nb = b;
+
+ return na->stats.total_time > nb->stats.total_time ? -1 : 1;
+}
+
+static size_t sc_node_hash(long key, void *ctx __maybe_unused)
+{
+ return key;
+}
+
+static bool sc_node_equal(long key1, long key2, void *ctx __maybe_unused)
+{
+ return key1 == key2;
+}
+
+static int print_common_stats(struct syscall_data *data, FILE *fp)
+{
+ int printed = 0;
+
+ for (int i = 0; i < data->nr_nodes; i++) {
+ struct syscall_node *node = &data->nodes[i];
+ struct syscall_stats *stat = &node->stats;
+ double total = (double)(stat->total_time) / NSEC_PER_MSEC;
+ double min = (double)(stat->min_time) / NSEC_PER_MSEC;
+ double max = (double)(stat->max_time) / NSEC_PER_MSEC;
+ double avg = total / stat->count;
+ const char *name;
+
+ /* TODO: support other ABIs */
+ name = syscalltbl__name(EM_HOST, node->syscall_nr);
+ if (name)
+ printed += fprintf(fp, " %-15s", name);
+ else
+ printed += fprintf(fp, " syscall:%-7d", node->syscall_nr);
+
+ printed += fprintf(fp, " %8u %6u %9.3f %9.3f %9.3f %9.3f %9.2f%%\n",
+ stat->count, stat->error, total, min, avg, max,
+ rel_stddev(stat));
+ }
+ return printed;
+}
+
+static int update_thread_stats(struct hashmap *hash, struct syscall_key *map_key,
+ struct syscall_stats *map_data)
+{
+ struct syscall_data *data;
+ struct syscall_node *nodes;
+
+ if (!hashmap__find(hash, map_key->cpu_or_tid, &data)) {
+ data = zalloc(sizeof(*data));
+ if (data == NULL)
+ return -ENOMEM;
+
+ data->key = map_key->cpu_or_tid;
+ if (hashmap__add(hash, data->key, data) < 0) {
+ free(data);
+ return -ENOMEM;
+ }
+ }
+
+ /* update thread total stats */
+ data->nr_events += map_data->count;
+ data->total_time += map_data->total_time;
+
+ nodes = reallocarray(data->nodes, data->nr_nodes + 1, sizeof(*nodes));
+ if (nodes == NULL)
+ return -ENOMEM;
+
+ data->nodes = nodes;
+ nodes = &data->nodes[data->nr_nodes++];
+ nodes->syscall_nr = map_key->nr;
+
+ /* each thread has an entry for each syscall, just use the stat */
+ memcpy(&nodes->stats, map_data, sizeof(*map_data));
+ return 0;
+}
+
+static int print_thread_stat(struct syscall_data *data, FILE *fp)
+{
+ int printed = 0;
+
+ qsort(data->nodes, data->nr_nodes, sizeof(*data->nodes), nodecmp);
+
+ printed += fprintf(fp, " thread (%d), ", data->key);
+ printed += fprintf(fp, "%d events\n\n", data->nr_events);
+
+ printed += fprintf(fp, " syscall calls errors total min avg max stddev\n");
+ printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n");
+ printed += fprintf(fp, " --------------- -------- ------ -------- --------- --------- --------- ------\n");
+
+ printed += print_common_stats(data, fp);
+ printed += fprintf(fp, "\n\n");
+
+ return printed;
+}
+
+static int print_thread_stats(struct syscall_data **data, int nr_data, FILE *fp)
+{
+ int printed = 0;
+
+ for (int i = 0; i < nr_data; i++)
+ printed += print_thread_stat(data[i], fp);
+
+ return printed;
+}
+
+static int update_total_stats(struct hashmap *hash, struct syscall_key *map_key,
+ struct syscall_stats *map_data)
+{
+ struct syscall_data *data;
+ struct syscall_stats *stat;
+
+ if (!hashmap__find(hash, map_key, &data)) {
+ data = zalloc(sizeof(*data));
+ if (data == NULL)
+ return -ENOMEM;
+
+ data->nodes = zalloc(sizeof(*data->nodes));
+ if (data->nodes == NULL) {
+ free(data);
+ return -ENOMEM;
+ }
+
+ data->nr_nodes = 1;
+ data->key = map_key->nr;
+ data->nodes->syscall_nr = data->key;
+
+ if (hashmap__add(hash, data->key, data) < 0) {
+ free(data->nodes);
+ free(data);
+ return -ENOMEM;
+ }
+ }
+
+ /* update total stats for this syscall */
+ data->nr_events += map_data->count;
+ data->total_time += map_data->total_time;
+
+ /* This is sum of the same syscall from different CPUs */
+ stat = &data->nodes->stats;
+
+ stat->total_time += map_data->total_time;
+ stat->squared_sum += map_data->squared_sum;
+ stat->count += map_data->count;
+ stat->error += map_data->error;
+
+ if (stat->max_time < map_data->max_time)
+ stat->max_time = map_data->max_time;
+ if (stat->min_time > map_data->min_time)
+ stat->min_time = map_data->min_time;
+
+ return 0;
+}
+
+static int print_total_stats(struct syscall_data **data, int nr_data, FILE *fp)
+{
+ int printed = 0;
+ int nr_events = 0;
+
+ for (int i = 0; i < nr_data; i++)
+ nr_events += data[i]->nr_events;
+
+ printed += fprintf(fp, " total, %d events\n\n", nr_events);
+
+ printed += fprintf(fp, " syscall calls errors total min avg max stddev\n");
+ printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n");
+ printed += fprintf(fp, " --------------- -------- ------ -------- --------- --------- --------- ------\n");
+
+ for (int i = 0; i < nr_data; i++)
+ printed += print_common_stats(data[i], fp);
+
+ printed += fprintf(fp, "\n\n");
+ return printed;
+}
+
+int trace_print_bpf_summary(FILE *fp)
+{
+ struct bpf_map *map = skel->maps.syscall_stats_map;
+ struct syscall_key *prev_key, key;
+ struct syscall_data **data = NULL;
+ struct hashmap schash;
+ struct hashmap_entry *entry;
+ int nr_data = 0;
+ int printed = 0;
+ int i;
+ size_t bkt;
+
+ hashmap__init(&schash, sc_node_hash, sc_node_equal, /*ctx=*/NULL);
+
+ printed = fprintf(fp, "\n Summary of events:\n\n");
+
+ /* get stats from the bpf map */
+ prev_key = NULL;
+ while (!bpf_map__get_next_key(map, prev_key, &key, sizeof(key))) {
+ struct syscall_stats stat;
+
+ if (!bpf_map__lookup_elem(map, &key, sizeof(key), &stat, sizeof(stat), 0)) {
+ if (skel->rodata->aggr_mode == SYSCALL_AGGR_THREAD)
+ update_thread_stats(&schash, &key, &stat);
+ else
+ update_total_stats(&schash, &key, &stat);
+ }
+
+ prev_key = &key;
+ }
+
+ nr_data = hashmap__size(&schash);
+ data = calloc(nr_data, sizeof(*data));
+ if (data == NULL)
+ goto out;
+
+ i = 0;
+ hashmap__for_each_entry(&schash, entry, bkt)
+ data[i++] = entry->pvalue;
+
+ qsort(data, nr_data, sizeof(*data), datacmp);
+
+ if (skel->rodata->aggr_mode == SYSCALL_AGGR_THREAD)
+ printed += print_thread_stats(data, nr_data, fp);
+ else
+ printed += print_total_stats(data, nr_data, fp);
+
+ for (i = 0; i < nr_data && data; i++) {
+ free(data[i]->nodes);
+ free(data[i]);
+ }
+ free(data);
+
+out:
+ hashmap__clear(&schash);
+ return printed;
+}
+
+void trace_cleanup_bpf_summary(void)
+{
+ syscall_summary_bpf__destroy(skel);
+}
diff --git a/tools/perf/util/bpf_skel/syscall_summary.bpf.c b/tools/perf/util/bpf_skel/syscall_summary.bpf.c
new file mode 100644
index 0000000000000000..b25f53b3c1351392
--- /dev/null
+++ b/tools/perf/util/bpf_skel/syscall_summary.bpf.c
@@ -0,0 +1,118 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Trace raw_syscalls tracepoints to collect system call statistics.
+ */
+
+#include "vmlinux.h"
+#include "syscall_summary.h"
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+/* This is to calculate a delta between sys-enter and sys-exit for each thread */
+struct syscall_trace {
+ int nr; /* syscall number is only available at sys-enter */
+ int unused;
+ u64 timestamp;
+};
+
+#define MAX_ENTRIES (128 * 1024)
+
+struct syscall_trace_map {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, int); /* tid */
+ __type(value, struct syscall_trace);
+ __uint(max_entries, MAX_ENTRIES);
+} syscall_trace_map SEC(".maps");
+
+struct syscall_stats_map {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, struct syscall_key);
+ __type(value, struct syscall_stats);
+ __uint(max_entries, MAX_ENTRIES);
+} syscall_stats_map SEC(".maps");
+
+int enabled; /* controlled from userspace */
+
+const volatile enum syscall_aggr_mode aggr_mode;
+
+static void update_stats(int cpu_or_tid, int nr, s64 duration, long ret)
+{
+ struct syscall_key key = { .cpu_or_tid = cpu_or_tid, .nr = nr, };
+ struct syscall_stats *stats;
+
+ stats = bpf_map_lookup_elem(&syscall_stats_map, &key);
+ if (stats == NULL) {
+ struct syscall_stats zero = {};
+
+ bpf_map_update_elem(&syscall_stats_map, &key, &zero, BPF_NOEXIST);
+ stats = bpf_map_lookup_elem(&syscall_stats_map, &key);
+ if (stats == NULL)
+ return;
+ }
+
+ __sync_fetch_and_add(&stats->count, 1);
+ if (ret < 0)
+ __sync_fetch_and_add(&stats->error, 1);
+
+ if (duration > 0) {
+ __sync_fetch_and_add(&stats->total_time, duration);
+ __sync_fetch_and_add(&stats->squared_sum, duration * duration);
+ if (stats->max_time < duration)
+ stats->max_time = duration;
+ if (stats->min_time > duration || stats->min_time == 0)
+ stats->min_time = duration;
+ }
+
+ return;
+}
+
+SEC("tp_btf/sys_enter")
+int sys_enter(u64 *ctx)
+{
+ int tid;
+ struct syscall_trace st;
+
+ if (!enabled)
+ return 0;
+
+ st.nr = ctx[1]; /* syscall number */
+ st.unused = 0;
+ st.timestamp = bpf_ktime_get_ns();
+
+ tid = bpf_get_current_pid_tgid();
+ bpf_map_update_elem(&syscall_trace_map, &tid, &st, BPF_ANY);
+
+ return 0;
+}
+
+SEC("tp_btf/sys_exit")
+int sys_exit(u64 *ctx)
+{
+ int tid;
+ int key;
+ long ret = ctx[1]; /* return value of the syscall */
+ struct syscall_trace *st;
+ s64 delta;
+
+ if (!enabled)
+ return 0;
+
+ tid = bpf_get_current_pid_tgid();
+ st = bpf_map_lookup_elem(&syscall_trace_map, &tid);
+ if (st == NULL)
+ return 0;
+
+ if (aggr_mode == SYSCALL_AGGR_THREAD)
+ key = tid;
+ else
+ key = bpf_get_smp_processor_id();
+
+ delta = bpf_ktime_get_ns() - st->timestamp;
+ update_stats(key, st->nr, delta, ret);
+
+ bpf_map_delete_elem(&syscall_trace_map, &tid);
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/perf/util/bpf_skel/syscall_summary.h b/tools/perf/util/bpf_skel/syscall_summary.h
new file mode 100644
index 0000000000000000..17f9ecba657088aa
--- /dev/null
+++ b/tools/perf/util/bpf_skel/syscall_summary.h
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+/* Data structures shared between BPF and tools. */
+#ifndef UTIL_BPF_SKEL_SYSCALL_SUMMARY_H
+#define UTIL_BPF_SKEL_SYSCALL_SUMMARY_H
+
+enum syscall_aggr_mode {
+ SYSCALL_AGGR_THREAD,
+ SYSCALL_AGGR_CPU,
+};
+
+struct syscall_key {
+ int cpu_or_tid;
+ int nr;
+};
+
+struct syscall_stats {
+ u64 total_time;
+ u64 squared_sum;
+ u64 max_time;
+ u64 min_time;
+ u32 count;
+ u32 error;
+};
+
+#endif /* UTIL_BPF_SKEL_SYSCALL_SUMMARY_H */
diff --git a/tools/perf/util/trace.h b/tools/perf/util/trace.h
new file mode 100644
index 0000000000000000..ef8361ed12c4edc1
--- /dev/null
+++ b/tools/perf/util/trace.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef UTIL_TRACE_H
+#define UTIL_TRACE_H
+
+#include <stdio.h> /* for FILE */
+
+enum trace_summary_mode {
+ SUMMARY__NONE = 0,
+ SUMMARY__BY_TOTAL,
+ SUMMARY__BY_THREAD,
+};
+
+#ifdef HAVE_BPF_SKEL
+
+int trace_prepare_bpf_summary(enum trace_summary_mode mode);
+void trace_start_bpf_summary(void);
+void trace_end_bpf_summary(void);
+int trace_print_bpf_summary(FILE *fp);
+void trace_cleanup_bpf_summary(void);
+
+#else /* !HAVE_BPF_SKEL */
+
+static inline int trace_prepare_bpf_summary(enum trace_summary_mode mode __maybe_unused)
+{
+ return -1;
+}
+static inline void trace_start_bpf_summary(void) {}
+static inline void trace_end_bpf_summary(void) {}
+static inline int trace_print_bpf_summary(FILE *fp __maybe_unused)
+{
+ return 0;
+}
+static inline void trace_cleanup_bpf_summary(void) {}
+
+#endif /* HAVE_BPF_SKEL */
+
+#endif /* UTIL_TRACE_H */
--
2.49.0.rc1.451.g8f38331e32-goog
Hello again Namhyung,
As funny as it sounds, I have too much homework this week. I had to
break the review into two parts. Sorry.
1) Maybe just '--bpf-summary' instead?
First of all, is '-s --bpf-summary' ergonomic? Why not drop the
-s and just --bpf-summary since the option has 'summary' in its name.
Another reason being,
sudo ./perf trace -S --bpf-summary --summary-mode=total -- sleep 1
sudo ./perf trace -s --bpf-summary --summary-mode=total -- sleep 1
are the same (-S will emit no output to stdout).
2) Anomaly observed when playing around
sudo ./perf trace -s --bpf-summary --summary-mode=total -- sleep 1
this gave me 10000 events
sudo ./perf trace -as --bpf-summary --summary-mode=total -- sleep 1
while this gave me 1000 events
I guess it's something to do with the lost events?
3) Wrong stddev values
Please compare these two outputs
perf $ sudo ./perf trace -as --summary-mode=total -- sleep 1
Summary of events:
total, 11290 events
syscall calls errors total min avg
max stddev
(msec) (msec) (msec)
(msec) (%)
--------------- -------- ------ -------- --------- ---------
--------- ------
mq_open 214 71 16073.976 0.000 75.112
250.120 9.91%
futex 1296 195 11592.060 0.000 8.944
907.590 13.59%
epoll_wait 479 0 4262.456 0.000 8.899
496.568 20.34%
poll 241 0 2545.090 0.000 10.561
607.894 33.33%
ppoll 330 0 1713.676 0.000 5.193
410.143 26.45%
migrate_pages 45 0 1031.915 0.000 22.931
147.830 20.70%
clock_nanosleep 2 0 1000.106 0.000 500.053
1000.106 100.00%
swapoff 340 0 909.827 0.000 2.676
50.117 22.76%
pselect6 5 0 604.816 0.000 120.963
604.808 100.00%
readlinkat 26 3 501.205 0.000 19.277
499.998 99.75%
perf $ sudo ./perf trace -as --bpf-summary --summary-mode=total -- sleep 1
Summary of events:
total, 880 events
syscall calls errors total min avg
max stddev
(msec) (msec) (msec)
(msec) (%)
--------------- -------- ------ -------- --------- ---------
--------- ------
futex 219 46 2326.400 0.001 10.623
243.028 337.77%
mq_open 19 8 2001.347 0.003 105.334
250.356 117.26%
poll 6 1 1002.512 0.002 167.085
1002.496 223.60%
clock_nanosleep 1 0 1000.147 1000.147 1000.147
1000.147 0.00%
swapoff 43 0 953.251 0.001 22.169
50.390 112.37%
migrate_pages 43 0 933.727 0.004 21.715
49.149 106.68%
ppoll 32 0 838.035 0.002 26.189
331.222 252.10%
epoll_pwait 5 0 499.578 0.001 99.916
499.565 199.99%
nanosleep 1 0 10.149 10.149 10.149
10.149 0.00%
epoll_wait 10 0 3.449 0.003 0.345
0.815 88.02%
readlinkat 25 3 1.424 0.006 0.057
0.080 41.76%
recvmsg 61 0 1.326 0.016 0.022
0.052 21.71%
execve 6 5 1.100 0.002 0.183
1.078 218.21%
I would say stddev here is a little off. The reason is:
On Mon, Mar 17, 2025 at 11:08 AM Namhyung Kim <namhyung@kernel.org> wrote:
>
> When -s/--summary option is used, it doesn't need (augmented) arguments
> of syscalls. Let's skip the augmentation and load another small BPF
> program to collect the statistics in the kernel instead of copying the
> data to the ring-buffer to calculate the stats in userspace. This will
> be much more light-weight than the existing approach and remove any lost
> events.
>
> Let's add a new option --bpf-summary to control this behavior. I cannot
> make it default because there's no way to get e_machine in the BPF which
> is needed for detecting different ABIs like 32-bit compat mode.
>
> No functional changes intended except for no more LOST events. :)
>
> $ sudo perf trace -as --bpf-summary --summary-mode=total -- sleep 1
>
> Summary of events:
>
> total, 2824 events
>
> syscall calls errors total min avg max stddev
> (msec) (msec) (msec) (msec) (%)
> --------------- -------- ------ -------- --------- --------- --------- ------
> futex 372 18 4373.773 0.000 11.757 997.715 660.42%
> poll 241 0 2757.963 0.000 11.444 997.758 580.34%
> epoll_wait 161 0 2460.854 0.000 15.285 325.189 260.73%
> ppoll 19 0 1298.652 0.000 68.350 667.172 281.46%
> clock_nanosleep 1 0 1000.093 0.000 1000.093 1000.093 0.00%
> epoll_pwait 16 0 192.787 0.000 12.049 173.994 348.73%
> nanosleep 6 0 50.926 0.000 8.488 10.210 43.96%
> ...
>
> Cc: Howard Chu <howardchu95@gmail.com>
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> ---
> v2)
> * rebased on top of Ian's e_machine changes
> * add --bpf-summary option
> * support per-thread summary
> * add stddev calculation (Howard)
<SNIP>
> +static double rel_stddev(struct syscall_stats *stat)
> +{
> + double variance, average;
> +
> + if (stat->count < 2)
> + return 0;
> +
> + average = (double)stat->total_time / stat->count;
> +
> + variance = stat->squared_sum;
> + variance -= (stat->total_time * stat->total_time) / stat->count;
> + variance /= stat->count;
isn't it 'variance /= stat->count - 1' because we used Bessel's
correction? (Link:
https://en.wikipedia.org/wiki/Bessel%27s_correction), that is to use n
- 1 instead of n, this is what's done in stat.c.
* (\Sum n_i^2) - ((\Sum n_i)^2)/n
* s^2 = -------------------------------
* n - 1
and the lines down here are unfortunately incorrect
+ variance = stat->squared_sum;
+ variance -= (stat->total_time * stat->total_time) / stat->count;
+ variance /= stat->count;
+
+ return 100 * sqrt(variance) / average;
variance /= stat->count - 1; will get you the variance, but I think we
need the variance of the mean.
Link: https://en.wikipedia.org/wiki/Standard_deviation#Relationship_between_standard_deviation_and_mean
it holds that:
variance(mean) = variance / N
so you are losing a '/ stat->count'
And with all due respect, although it makes total sense in
engineering, mathematically, I find variance = stat->squared_sum,
variance -= ... these accumulated calculations on variable 'variance'
a little weird... because readers may find it difficult to determine at
which point it becomes the actual 'variance'
with clarity in mind:
diff --git a/tools/perf/util/bpf-trace-summary.c
b/tools/perf/util/bpf-trace-summary.c
index 5ae9feca244d..a435b4037082 100644
--- a/tools/perf/util/bpf-trace-summary.c
+++ b/tools/perf/util/bpf-trace-summary.c
@@ -62,18 +62,18 @@ struct syscall_node {
static double rel_stddev(struct syscall_stats *stat)
{
- double variance, average;
+ double variance, average, squared_total, stddev_mean;
if (stat->count < 2)
return 0;
average = (double)stat->total_time / stat->count;
- variance = stat->squared_sum;
- variance -= (stat->total_time * stat->total_time) / stat->count;
- variance /= stat->count;
+ squared_total = stat->total_time * stat->total_time;
+ variance = (stat->squared_sum - squared_total / stat->count) /
(stat->count - 1);
+ stddev_mean = sqrt(variance / stat->count);
- return 100 * sqrt(variance) / average;
+ return 100 * stddev_mean / average;
}
btw I haven't checked the legal range for stddev_mean, so I can be wrong.
<SNIP>
> +static int update_total_stats(struct hashmap *hash, struct syscall_key *map_key,
> + struct syscall_stats *map_data)
> +{
> + struct syscall_data *data;
> + struct syscall_stats *stat;
> +
> + if (!hashmap__find(hash, map_key, &data)) {
> + data = zalloc(sizeof(*data));
> + if (data == NULL)
> + return -ENOMEM;
> +
> + data->nodes = zalloc(sizeof(*data->nodes));
> + if (data->nodes == NULL) {
> + free(data);
> + return -ENOMEM;
> + }
> +
> + data->nr_nodes = 1;
> + data->key = map_key->nr;
> + data->nodes->syscall_nr = data->key;
Wow, aggressive. I guess you want it to behave like a single value
when it is SYSCALL_AGGR_CPU, and an array when it is
SYSCALL_AGGR_THREAD. Do you mind adding a comment about it?
so it's
(cpu, syscall_nr) -> data -> {node}
(tid, syscall_nr) -> data -> [node1, node2, node3]
> +
> + if (hashmap__add(hash, data->key, data) < 0) {
> + free(data->nodes);
> + free(data);
> + return -ENOMEM;
> + }
> + }
> +
> + /* update total stats for this syscall */
> + data->nr_events += map_data->count;
> + data->total_time += map_data->total_time;
> +
> + /* This is sum of the same syscall from different CPUs */
> + stat = &data->nodes->stats;
> +
> + stat->total_time += map_data->total_time;
> + stat->squared_sum += map_data->squared_sum;
> + stat->count += map_data->count;
> + stat->error += map_data->error;
> +
> + if (stat->max_time < map_data->max_time)
> + stat->max_time = map_data->max_time;
> + if (stat->min_time > map_data->min_time)
> + stat->min_time = map_data->min_time;
> +
> + return 0;
> +}
> +
> +static int print_total_stats(struct syscall_data **data, int nr_data, FILE *fp)
> +{
> + int printed = 0;
> + int nr_events = 0;
> +
> + for (int i = 0; i < nr_data; i++)
> + nr_events += data[i]->nr_events;
> +
> + printed += fprintf(fp, " total, %d events\n\n", nr_events);
> +
> + printed += fprintf(fp, " syscall calls errors total min avg max stddev\n");
> + printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n");
> + printed += fprintf(fp, " --------------- -------- ------ -------- --------- --------- --------- ------\n");
> +
> + for (int i = 0; i < nr_data; i++)
> + printed += print_common_stats(data[i], fp);
> +
> + printed += fprintf(fp, "\n\n");
> + return printed;
> +}
> +
> +int trace_print_bpf_summary(FILE *fp)
> +{
> + struct bpf_map *map = skel->maps.syscall_stats_map;
> + struct syscall_key *prev_key, key;
> + struct syscall_data **data = NULL;
> + struct hashmap schash;
> + struct hashmap_entry *entry;
> + int nr_data = 0;
> + int printed = 0;
> + int i;
> + size_t bkt;
> +
> + hashmap__init(&schash, sc_node_hash, sc_node_equal, /*ctx=*/NULL);
> +
> + printed = fprintf(fp, "\n Summary of events:\n\n");
> +
> + /* get stats from the bpf map */
> + prev_key = NULL;
> + while (!bpf_map__get_next_key(map, prev_key, &key, sizeof(key))) {
> + struct syscall_stats stat;
> +
> + if (!bpf_map__lookup_elem(map, &key, sizeof(key), &stat, sizeof(stat), 0)) {
> + if (skel->rodata->aggr_mode == SYSCALL_AGGR_THREAD)
> + update_thread_stats(&schash, &key, &stat);
> + else
> + update_total_stats(&schash, &key, &stat);
> + }
> +
> + prev_key = &key;
> + }
> +
> + nr_data = hashmap__size(&schash);
> + data = calloc(nr_data, sizeof(*data));
> + if (data == NULL)
> + goto out;
> +
> + i = 0;
> + hashmap__for_each_entry(&schash, entry, bkt)
> + data[i++] = entry->pvalue;
> +
> + qsort(data, nr_data, sizeof(*data), datacmp);
Here syscall_data is sorted for AGGR_THREAD and AGGR_CPU, meaning the
thread who has the higher total syscall period will be printed first.
This is an awesome side effect but it is not the behavior of 'sudo
./perf trace -as -- sleep 1' without the --bpf-summary option. If it
is not too trivial, maybe consider documenting this behavior? But it
may be too verbose so Idk.
sudo ./perf trace -as -- sleep 1
ClientModuleMan (4956), 16 events, 0.1%
syscall calls errors total min avg
max stddev
(msec) (msec) (msec)
(msec) (%)
--------------- -------- ------ -------- --------- ---------
--------- ------
futex 8 4 750.234 0.000 93.779
250.105 48.79%
CHTTPClientThre (15720), 16 events, 0.1%
syscall calls errors total min avg
max stddev
(msec) (msec) (msec)
(msec) (%)
--------------- -------- ------ -------- --------- ---------
--------- ------
futex 8 4 1000.425 0.000 125.053
750.317 75.59%
The order is random for the command above.
Thanks,
Howard
On Thu, Mar 20, 2025 at 07:35:01PM -0700, Howard Chu wrote:
> Hello again Namhyung,
>
> As funny as it sounds, I have too much homework this week. I had to
> break the review into two parts. Sorry.
Thanks for taking your time!
>
> 1) Maybe just '--bpf-summary' instead?
>
> First of all, is '-s --bpf-summary' is it ergonomic? Why not drop the
> -s and just --bpf-summary since the option has 'summary' in its name.
> Another reason being,
> sudo ./perf trace -S --bpf-summary --summary-mode=total -- sleep 1
> sudo ./perf trace -s --bpf-summary --summary-mode=total -- sleep 1
> are the same (-S will emit no output to stdout).
Hmm.. it looks like a bug, will take a look. Maybe --bpf-summary is
redundant, but I think we can make it default later so that -s/-S can
use BPF without the option explicitly. Then this option can be used
to disable it (--no-bpf-summary) for whatever reasons.
>
> 2) Anomaly observed when playing around
>
> sudo ./perf trace -s --bpf-summary --summary-mode=total -- sleep 1
> this gave me 10000 events
>
> sudo ./perf trace -as --bpf-summary --summary-mode=total -- sleep 1
> while this gave me 1000 events
>
> I guess it's something to do with the lost events?
No, as you said in the previous message it didn't support process
targets yet. I plan to disable it without -a for now.
>
> 3) Wrong stddev values
> Please compare these two outputs
>
> perf $ sudo ./perf trace -as --summary-mode=total -- sleep 1
>
> Summary of events:
>
> total, 11290 events
>
> syscall calls errors total min avg
> max stddev
> (msec) (msec) (msec)
> (msec) (%)
> --------------- -------- ------ -------- --------- ---------
> --------- ------
> mq_open 214 71 16073.976 0.000 75.112
> 250.120 9.91%
> futex 1296 195 11592.060 0.000 8.944
> 907.590 13.59%
> epoll_wait 479 0 4262.456 0.000 8.899
> 496.568 20.34%
> poll 241 0 2545.090 0.000 10.561
> 607.894 33.33%
> ppoll 330 0 1713.676 0.000 5.193
> 410.143 26.45%
> migrate_pages 45 0 1031.915 0.000 22.931
> 147.830 20.70%
> clock_nanosleep 2 0 1000.106 0.000 500.053
> 1000.106 100.00%
> swapoff 340 0 909.827 0.000 2.676
> 50.117 22.76%
> pselect6 5 0 604.816 0.000 120.963
> 604.808 100.00%
> readlinkat 26 3 501.205 0.000 19.277
> 499.998 99.75%
>
> perf $ sudo ./perf trace -as --bpf-summary --summary-mode=total -- sleep 1
>
> Summary of events:
>
> total, 880 events
>
> syscall calls errors total min avg
> max stddev
> (msec) (msec) (msec)
> (msec) (%)
> --------------- -------- ------ -------- --------- ---------
> --------- ------
> futex 219 46 2326.400 0.001 10.623
> 243.028 337.77%
> mq_open 19 8 2001.347 0.003 105.334
> 250.356 117.26%
> poll 6 1 1002.512 0.002 167.085
> 1002.496 223.60%
> clock_nanosleep 1 0 1000.147 1000.147 1000.147
> 1000.147 0.00%
> swapoff 43 0 953.251 0.001 22.169
> 50.390 112.37%
> migrate_pages 43 0 933.727 0.004 21.715
> 49.149 106.68%
> ppoll 32 0 838.035 0.002 26.189
> 331.222 252.10%
> epoll_pwait 5 0 499.578 0.001 99.916
> 499.565 199.99%
> nanosleep 1 0 10.149 10.149 10.149
> 10.149 0.00%
> epoll_wait 10 0 3.449 0.003 0.345
> 0.815 88.02%
> readlinkat 25 3 1.424 0.006 0.057
> 0.080 41.76%
> recvmsg 61 0 1.326 0.016 0.022
> 0.052 21.71%
> execve 6 5 1.100 0.002 0.183
> 1.078 218.21%
>
> I would say stddev here is a little off. The reason is:
>
> On Mon, Mar 17, 2025 at 11:08 AM Namhyung Kim <namhyung@kernel.org> wrote:
> >
> > When -s/--summary option is used, it doesn't need (augmented) arguments
> > of syscalls. Let's skip the augmentation and load another small BPF
> > program to collect the statistics in the kernel instead of copying the
> > data to the ring-buffer to calculate the stats in userspace. This will
> > be much more light-weight than the existing approach and remove any lost
> > events.
> >
> > Let's add a new option --bpf-summary to control this behavior. I cannot
> > make it default because there's no way to get e_machine in the BPF which
> > is needed for detecting different ABIs like 32-bit compat mode.
> >
> > No functional changes intended except for no more LOST events. :)
> >
> > $ sudo perf trace -as --bpf-summary --summary-mode=total -- sleep 1
> >
> > Summary of events:
> >
> > total, 2824 events
> >
> > syscall calls errors total min avg max stddev
> > (msec) (msec) (msec) (msec) (%)
> > --------------- -------- ------ -------- --------- --------- --------- ------
> > futex 372 18 4373.773 0.000 11.757 997.715 660.42%
> > poll 241 0 2757.963 0.000 11.444 997.758 580.34%
> > epoll_wait 161 0 2460.854 0.000 15.285 325.189 260.73%
> > ppoll 19 0 1298.652 0.000 68.350 667.172 281.46%
> > clock_nanosleep 1 0 1000.093 0.000 1000.093 1000.093 0.00%
> > epoll_pwait 16 0 192.787 0.000 12.049 173.994 348.73%
> > nanosleep 6 0 50.926 0.000 8.488 10.210 43.96%
> > ...
> >
> > Cc: Howard Chu <howardchu95@gmail.com>
> > Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> > ---
> > v2)
> > * rebased on top of Ian's e_machine changes
> > * add --bpf-summary option
> > * support per-thread summary
> > * add stddev calculation (Howard)
> <SNIP>
> > +static double rel_stddev(struct syscall_stats *stat)
> > +{
> > + double variance, average;
> > +
> > + if (stat->count < 2)
> > + return 0;
> > +
> > + average = (double)stat->total_time / stat->count;
> > +
> > + variance = stat->squared_sum;
> > + variance -= (stat->total_time * stat->total_time) / stat->count;
> > + variance /= stat->count;
>
> isn't it 'variance /= stat->count - 1' because we used Bessel's
> correction? (Link:
> https://en.wikipedia.org/wiki/Bessel%27s_correction), that is to use n
> - 1 instead of n, this is what's done in stat.c.
>
> * (\Sum n_i^2) - ((\Sum n_i)^2)/n
> * s^2 = -------------------------------
> * n - 1
>
> and the lines down here are unfortunately incorrect
> + variance = stat->squared_sum;
> + variance -= (stat->total_time * stat->total_time) / stat->count;
> + variance /= stat->count;
> +
> + return 100 * sqrt(variance) / average;
>
> variance /= stat->count - 1; will get you variance, but I think we
> need variance mean.
> Link: https://en.wikipedia.org/wiki/Standard_deviation#Relationship_between_standard_deviation_and_mean
>
> it holds that:
> variance(mean) = variance / N
>
> so you are losing a '/ stat->count'
You're right, thanks for pointing that out.
>
> And with all due respect, although it makes total sense in
> engineering, mathematically, I find variance = stat->squared_sum,
> variance -= ... these accumulated calculations on variable 'variance'
> a little weird... because readers may find difficult to determine at
> which point it becomes the actual 'variance'
>
> with clarity in mind:
>
> diff --git a/tools/perf/util/bpf-trace-summary.c
> b/tools/perf/util/bpf-trace-summary.c
> index 5ae9feca244d..a435b4037082 100644
> --- a/tools/perf/util/bpf-trace-summary.c
> +++ b/tools/perf/util/bpf-trace-summary.c
> @@ -62,18 +62,18 @@ struct syscall_node {
>
> static double rel_stddev(struct syscall_stats *stat)
> {
> - double variance, average;
> + double variance, average, squared_total;
>
> if (stat->count < 2)
> return 0;
>
> average = (double)stat->total_time / stat->count;
>
> - variance = stat->squared_sum;
> - variance -= (stat->total_time * stat->total_time) / stat->count;
> - variance /= stat->count;
> + squared_total = stat->total_time * stat->total_time;
> + variance = (stat->squared_sum - squared_total / stat->count) /
> (stat->count - 1);
> + stddev_mean = sqrt(variance / stat->count);
>
> - return 100 * sqrt(variance) / average;
> + return 100 * stddev_mean / average;
> }
Can it be like this?
diff --git a/tools/perf/util/bpf-trace-summary.c b/tools/perf/util/bpf-trace-summary.c
index a91d42447e850a59..c897fb017914960c 100644
--- a/tools/perf/util/bpf-trace-summary.c
+++ b/tools/perf/util/bpf-trace-summary.c
@@ -71,9 +71,9 @@ static double rel_stddev(struct syscall_stats *stat)
variance = stat->squared_sum;
variance -= (stat->total_time * stat->total_time) / stat->count;
- variance /= stat->count;
+ variance /= stat->count - 1;
- return 100 * sqrt(variance) / average;
+ return 100 * sqrt(variance / stat->count) / average;
}
struct syscall_data {
>
> btw I haven't checked the legal range for stddev_mean, so I can be wrong.
> <SNIP>
> > +static int update_total_stats(struct hashmap *hash, struct syscall_key *map_key,
> > + struct syscall_stats *map_data)
> > +{
> > + struct syscall_data *data;
> > + struct syscall_stats *stat;
> > +
> > + if (!hashmap__find(hash, map_key, &data)) {
> > + data = zalloc(sizeof(*data));
> > + if (data == NULL)
> > + return -ENOMEM;
> > +
> > + data->nodes = zalloc(sizeof(*data->nodes));
> > + if (data->nodes == NULL) {
> > + free(data);
> > + return -ENOMEM;
> > + }
> > +
> > + data->nr_nodes = 1;
> > + data->key = map_key->nr;
> > + data->nodes->syscall_nr = data->key;
> Wow, aggressive. I guess you want it to behave like a single value
> when it is SYSCALL_AGGR_CPU, and an array when it is
> SYSCALL_AGGR_THREAD. Do you mind adding a comment about it?
>
> so it's
>
> (cpu, syscall_nr) -> data -> {node}
> (tid, syscall_nr) -> data -> [node1, node2, node3]
Right, will add comments.
>
>
> > +
> > + if (hashmap__add(hash, data->key, data) < 0) {
> > + free(data->nodes);
> > + free(data);
> > + return -ENOMEM;
> > + }
> > + }
> > +
> > + /* update total stats for this syscall */
> > + data->nr_events += map_data->count;
> > + data->total_time += map_data->total_time;
> > +
> > + /* This is sum of the same syscall from different CPUs */
> > + stat = &data->nodes->stats;
> > +
> > + stat->total_time += map_data->total_time;
> > + stat->squared_sum += map_data->squared_sum;
> > + stat->count += map_data->count;
> > + stat->error += map_data->error;
> > +
> > + if (stat->max_time < map_data->max_time)
> > + stat->max_time = map_data->max_time;
> > + if (stat->min_time > map_data->min_time)
> > + stat->min_time = map_data->min_time;
> > +
> > + return 0;
> > +}
> > +
> > +static int print_total_stats(struct syscall_data **data, int nr_data, FILE *fp)
> > +{
> > + int printed = 0;
> > + int nr_events = 0;
> > +
> > + for (int i = 0; i < nr_data; i++)
> > + nr_events += data[i]->nr_events;
> > +
> > + printed += fprintf(fp, " total, %d events\n\n", nr_events);
> > +
> > + printed += fprintf(fp, " syscall calls errors total min avg max stddev\n");
> > + printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n");
> > + printed += fprintf(fp, " --------------- -------- ------ -------- --------- --------- --------- ------\n");
> > +
> > + for (int i = 0; i < nr_data; i++)
> > + printed += print_common_stats(data[i], fp);
> > +
> > + printed += fprintf(fp, "\n\n");
> > + return printed;
> > +}
> > +
> > +int trace_print_bpf_summary(FILE *fp)
> > +{
> > + struct bpf_map *map = skel->maps.syscall_stats_map;
> > + struct syscall_key *prev_key, key;
> > + struct syscall_data **data = NULL;
> > + struct hashmap schash;
> > + struct hashmap_entry *entry;
> > + int nr_data = 0;
> > + int printed = 0;
> > + int i;
> > + size_t bkt;
> > +
> > + hashmap__init(&schash, sc_node_hash, sc_node_equal, /*ctx=*/NULL);
> > +
> > + printed = fprintf(fp, "\n Summary of events:\n\n");
> > +
> > + /* get stats from the bpf map */
> > + prev_key = NULL;
> > + while (!bpf_map__get_next_key(map, prev_key, &key, sizeof(key))) {
> > + struct syscall_stats stat;
> > +
> > + if (!bpf_map__lookup_elem(map, &key, sizeof(key), &stat, sizeof(stat), 0)) {
> > + if (skel->rodata->aggr_mode == SYSCALL_AGGR_THREAD)
> > + update_thread_stats(&schash, &key, &stat);
> > + else
> > + update_total_stats(&schash, &key, &stat);
> > + }
> > +
> > + prev_key = &key;
> > + }
> > +
> > + nr_data = hashmap__size(&schash);
> > + data = calloc(nr_data, sizeof(*data));
> > + if (data == NULL)
> > + goto out;
> > +
> > + i = 0;
> > + hashmap__for_each_entry(&schash, entry, bkt)
> > + data[i++] = entry->pvalue;
> > +
> > + qsort(data, nr_data, sizeof(*data), datacmp);
>
> Here syscall_data is sorted for AGGR_THREAD and AGGR_CPU, meaning the
> thread who has the higher total syscall period will be printed first.
> This is an awesome side effect but it is not the behavior of 'sudo
> ./perf trace -as -- sleep 1' without the --bpf-summary option. If it
> is not too trivial, maybe consider documenting this behavior? But it
> may be too verbose so Idk.
The original behavior is random ordering and now it's ordered by total
time. But still we can think it's randomly ordered. I believe we need
to maintain strict ordering by total time and then fix the existing
code. Once that happens I can add the documentation.
Thanks for your careful review!
Namhyung
>
> sudo ./perf trace -as -- sleep 1
>
> ClientModuleMan (4956), 16 events, 0.1%
>
> syscall calls errors total min avg
> max stddev
> (msec) (msec) (msec)
> (msec) (%)
> --------------- -------- ------ -------- --------- ---------
> --------- ------
> futex 8 4 750.234 0.000 93.779
> 250.105 48.79%
>
>
> CHTTPClientThre (15720), 16 events, 0.1%
>
> syscall calls errors total min avg
> max stddev
> (msec) (msec) (msec)
> (msec) (%)
> --------------- -------- ------ -------- --------- ---------
> --------- ------
> futex 8 4 1000.425 0.000 125.053
> 750.317 75.59%
>
> The order is random for the command above.
>
> Thanks,
> Howard
Hello Namhyung,
On Thu, Mar 20, 2025 at 10:54 PM Namhyung Kim <namhyung@kernel.org> wrote:
>
> On Thu, Mar 20, 2025 at 07:35:01PM -0700, Howard Chu wrote:
> > Hello again Namhyung,
> >
> > As funny as it sounds, I have too much homework this week. I had to
> > break the review into two parts. Sorry.
>
> Thanks for taking your time!
>
> >
> > 1) Maybe just '--bpf-summary' instead?
> >
> > First of all, is '-s --bpf-summary' ergonomic? Why not drop the
> > -s and just --bpf-summary since the option has 'summary' in its name.
> > Another reason being,
> > sudo ./perf trace -S --bpf-summary --summary-mode=total -- sleep 1
> > sudo ./perf trace -s --bpf-summary --summary-mode=total -- sleep 1
> > are the same (-S will emit no output to stdout).
>
> Hmm.. it looks like a bug, will take a look. Maybe --bpf-summary is
> redundant, but I think we can make it default later so that -s/-S can
> use BPF without the option explicitly. Then this option can be used
> to disable it (--no-bpf-summary) for whatever reasons.
>
> >
> > 2) Anomaly observed when playing around
> >
> > sudo ./perf trace -s --bpf-summary --summary-mode=total -- sleep 1
> > this gave me 10000 events
> >
> > sudo ./perf trace -as --bpf-summary --summary-mode=total -- sleep 1
> > while this gave me 1000 events
> >
> > I guess it's something to do with the lost events?
>
> No, as you said in the previous message it didn't support process
> targets yet. I plan to disable it without -a for now.
I copied the wrong command in the email, it should be:
there is a difference between
sudo ./perf trace -as --summary-mode=total -- sleep 1
sudo ./perf trace -as --bpf-summary --summary-mode=total -- sleep 1
perf $ sudo ./perf trace -as --summary-mode=total -- sleep 1
[sudo] password for howard:
Summary of events:
total, 15354 events
perf $ sudo ./perf trace -as --bpf-summary --summary-mode=total -- sleep 1
Summary of events:
total, 1319 events
without the --bpf-summary perf trace gave more events, and it ran slower
as for 'I plan to disable it without -a for now.' I think this makes sense.
>
> >
> > 3) Wrong stddev values
> > Please compare these two outputs
> >
> > perf $ sudo ./perf trace -as --summary-mode=total -- sleep 1
> >
> > Summary of events:
> >
> > total, 11290 events
> >
> > syscall calls errors total min avg
> > max stddev
> > (msec) (msec) (msec)
> > (msec) (%)
> > --------------- -------- ------ -------- --------- ---------
> > --------- ------
> > mq_open 214 71 16073.976 0.000 75.112
> > 250.120 9.91%
> > futex 1296 195 11592.060 0.000 8.944
> > 907.590 13.59%
> > epoll_wait 479 0 4262.456 0.000 8.899
> > 496.568 20.34%
> > poll 241 0 2545.090 0.000 10.561
> > 607.894 33.33%
> > ppoll 330 0 1713.676 0.000 5.193
> > 410.143 26.45%
> > migrate_pages 45 0 1031.915 0.000 22.931
> > 147.830 20.70%
> > clock_nanosleep 2 0 1000.106 0.000 500.053
> > 1000.106 100.00%
> > swapoff 340 0 909.827 0.000 2.676
> > 50.117 22.76%
> > pselect6 5 0 604.816 0.000 120.963
> > 604.808 100.00%
> > readlinkat 26 3 501.205 0.000 19.277
> > 499.998 99.75%
> >
> > perf $ sudo ./perf trace -as --bpf-summary --summary-mode=total -- sleep 1
> >
> > Summary of events:
> >
> > total, 880 events
> >
> > syscall calls errors total min avg
> > max stddev
> > (msec) (msec) (msec)
> > (msec) (%)
> > --------------- -------- ------ -------- --------- ---------
> > --------- ------
> > futex 219 46 2326.400 0.001 10.623
> > 243.028 337.77%
> > mq_open 19 8 2001.347 0.003 105.334
> > 250.356 117.26%
> > poll 6 1 1002.512 0.002 167.085
> > 1002.496 223.60%
> > clock_nanosleep 1 0 1000.147 1000.147 1000.147
> > 1000.147 0.00%
> > swapoff 43 0 953.251 0.001 22.169
> > 50.390 112.37%
> > migrate_pages 43 0 933.727 0.004 21.715
> > 49.149 106.68%
> > ppoll 32 0 838.035 0.002 26.189
> > 331.222 252.10%
> > epoll_pwait 5 0 499.578 0.001 99.916
> > 499.565 199.99%
> > nanosleep 1 0 10.149 10.149 10.149
> > 10.149 0.00%
> > epoll_wait 10 0 3.449 0.003 0.345
> > 0.815 88.02%
> > readlinkat 25 3 1.424 0.006 0.057
> > 0.080 41.76%
> > recvmsg 61 0 1.326 0.016 0.022
> > 0.052 21.71%
> > execve 6 5 1.100 0.002 0.183
> > 1.078 218.21%
> >
> > I would say stddev here is a little off. The reason is:
> >
> > On Mon, Mar 17, 2025 at 11:08 AM Namhyung Kim <namhyung@kernel.org> wrote:
> > >
> > > When -s/--summary option is used, it doesn't need (augmented) arguments
> > > of syscalls. Let's skip the augmentation and load another small BPF
> > > program to collect the statistics in the kernel instead of copying the
> > > data to the ring-buffer to calculate the stats in userspace. This will
> > > be much more light-weight than the existing approach and remove any lost
> > > events.
> > >
> > > Let's add a new option --bpf-summary to control this behavior. I cannot
> > > make it default because there's no way to get e_machine in the BPF which
> > > is needed for detecting different ABIs like 32-bit compat mode.
> > >
> > > No functional changes intended except for no more LOST events. :)
> > >
> > > $ sudo perf trace -as --bpf-summary --summary-mode=total -- sleep 1
> > >
> > > Summary of events:
> > >
> > > total, 2824 events
> > >
> > > syscall calls errors total min avg max stddev
> > > (msec) (msec) (msec) (msec) (%)
> > > --------------- -------- ------ -------- --------- --------- --------- ------
> > > futex 372 18 4373.773 0.000 11.757 997.715 660.42%
> > > poll 241 0 2757.963 0.000 11.444 997.758 580.34%
> > > epoll_wait 161 0 2460.854 0.000 15.285 325.189 260.73%
> > > ppoll 19 0 1298.652 0.000 68.350 667.172 281.46%
> > > clock_nanosleep 1 0 1000.093 0.000 1000.093 1000.093 0.00%
> > > epoll_pwait 16 0 192.787 0.000 12.049 173.994 348.73%
> > > nanosleep 6 0 50.926 0.000 8.488 10.210 43.96%
> > > ...
> > >
> > > Cc: Howard Chu <howardchu95@gmail.com>
> > > Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> > > ---
> > > v2)
> > > * rebased on top of Ian's e_machine changes
> > > * add --bpf-summary option
> > > * support per-thread summary
> > > * add stddev calculation (Howard)
> > <SNIP>
> > > +static double rel_stddev(struct syscall_stats *stat)
> > > +{
> > > + double variance, average;
> > > +
> > > + if (stat->count < 2)
> > > + return 0;
> > > +
> > > + average = (double)stat->total_time / stat->count;
> > > +
> > > + variance = stat->squared_sum;
> > > + variance -= (stat->total_time * stat->total_time) / stat->count;
> > > + variance /= stat->count;
> >
> > isn't it 'variance /= stat->count - 1' because we used Bessel's
> > correction? (Link:
> > https://en.wikipedia.org/wiki/Bessel%27s_correction), that is to use n
> > - 1 instead of n, this is what's done in stat.c.
> >
> > * (\Sum n_i^2) - ((\Sum n_i)^2)/n
> > * s^2 = -------------------------------
> > * n - 1
> >
> > and the lines down here are unfortunately incorrect
> > + variance = stat->squared_sum;
> > + variance -= (stat->total_time * stat->total_time) / stat->count;
> > + variance /= stat->count;
> > +
> > + return 100 * sqrt(variance) / average;
> >
> > variance /= stat->count - 1; will get you variance, but I think we
> > need variance mean.
> > Link: https://en.wikipedia.org/wiki/Standard_deviation#Relationship_between_standard_deviation_and_mean
> >
> > it holds that:
> > variance(mean) = variance / N
> >
> > so you are losing a '/ stat->count'
>
> You're right, thanks for pointing that out.
>
> >
> > And with all due respect, although it makes total sense in
> > engineering, mathematically, I find variance = stat->squared_sum,
> > variance -= ... these accumulated calculations on variable 'variance'
> > a little weird... because readers may find difficult to determine at
> > which point it becomes the actual 'variance'
> >
> > with clarity in mind:
> >
> > diff --git a/tools/perf/util/bpf-trace-summary.c
> > b/tools/perf/util/bpf-trace-summary.c
> > index 5ae9feca244d..a435b4037082 100644
> > --- a/tools/perf/util/bpf-trace-summary.c
> > +++ b/tools/perf/util/bpf-trace-summary.c
> > @@ -62,18 +62,18 @@ struct syscall_node {
> >
> > static double rel_stddev(struct syscall_stats *stat)
> > {
> > - double variance, average;
> > + double variance, average, squared_total;
> >
> > if (stat->count < 2)
> > return 0;
> >
> > average = (double)stat->total_time / stat->count;
> >
> > - variance = stat->squared_sum;
> > - variance -= (stat->total_time * stat->total_time) / stat->count;
> > - variance /= stat->count;
> > + squared_total = stat->total_time * stat->total_time;
> > + variance = (stat->squared_sum - squared_total / stat->count) /
> > (stat->count - 1);
> > + stddev_mean = sqrt(variance / stat->count);
> >
> > - return 100 * sqrt(variance) / average;
> > + return 100 * stddev_mean / average;
> > }
>
> Can it be like this?
>
> diff --git a/tools/perf/util/bpf-trace-summary.c b/tools/perf/util/bpf-trace-summary.c
> index a91d42447e850a59..c897fb017914960c 100644
> --- a/tools/perf/util/bpf-trace-summary.c
> +++ b/tools/perf/util/bpf-trace-summary.c
> @@ -71,9 +71,9 @@ static double rel_stddev(struct syscall_stats *stat)
>
> variance = stat->squared_sum;
> variance -= (stat->total_time * stat->total_time) / stat->count;
> - variance /= stat->count;
> + variance /= stat->count - 1;
>
> - return 100 * sqrt(variance) / average;
> + return 100 * sqrt(variance / stat->count) / average;
> }
Of course. Then the variable 'variance' would mean 'variance of mean'.
>
> struct syscall_data {
>
> >
> > btw I haven't checked the legal range for stddev_mean, so I can be wrong.
> > <SNIP>
> > > +static int update_total_stats(struct hashmap *hash, struct syscall_key *map_key,
> > > + struct syscall_stats *map_data)
> > > +{
> > > + struct syscall_data *data;
> > > + struct syscall_stats *stat;
> > > +
> > > + if (!hashmap__find(hash, map_key, &data)) {
> > > + data = zalloc(sizeof(*data));
> > > + if (data == NULL)
> > > + return -ENOMEM;
> > > +
> > > + data->nodes = zalloc(sizeof(*data->nodes));
> > > + if (data->nodes == NULL) {
> > > + free(data);
> > > + return -ENOMEM;
> > > + }
> > > +
> > > + data->nr_nodes = 1;
> > > + data->key = map_key->nr;
> > > + data->nodes->syscall_nr = data->key;
> > Wow, aggressive. I guess you want it to behave like a single value
> > when it is SYSCALL_AGGR_CPU, and an array when it is
> > SYSCALL_AGGR_THREAD. Do you mind adding a comment about it?
> >
> > so it's
> >
> > (cpu, syscall_nr) -> data -> {node}
> > (tid, syscall_nr) -> data -> [node1, node2, node3]
>
> Right, will add comments.
>
> >
> >
> > > +
> > > + if (hashmap__add(hash, data->key, data) < 0) {
> > > + free(data->nodes);
> > > + free(data);
> > > + return -ENOMEM;
> > > + }
> > > + }
> > > +
> > > + /* update total stats for this syscall */
> > > + data->nr_events += map_data->count;
> > > + data->total_time += map_data->total_time;
> > > +
> > > + /* This is sum of the same syscall from different CPUs */
> > > + stat = &data->nodes->stats;
> > > +
> > > + stat->total_time += map_data->total_time;
> > > + stat->squared_sum += map_data->squared_sum;
> > > + stat->count += map_data->count;
> > > + stat->error += map_data->error;
> > > +
> > > + if (stat->max_time < map_data->max_time)
> > > + stat->max_time = map_data->max_time;
> > > + if (stat->min_time > map_data->min_time)
> > > + stat->min_time = map_data->min_time;
> > > +
> > > + return 0;
> > > +}
> > > +
> > > +static int print_total_stats(struct syscall_data **data, int nr_data, FILE *fp)
> > > +{
> > > + int printed = 0;
> > > + int nr_events = 0;
> > > +
> > > + for (int i = 0; i < nr_data; i++)
> > > + nr_events += data[i]->nr_events;
> > > +
> > > + printed += fprintf(fp, " total, %d events\n\n", nr_events);
> > > +
> > > + printed += fprintf(fp, " syscall calls errors total min avg max stddev\n");
> > > + printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n");
> > > + printed += fprintf(fp, " --------------- -------- ------ -------- --------- --------- --------- ------\n");
> > > +
> > > + for (int i = 0; i < nr_data; i++)
> > > + printed += print_common_stats(data[i], fp);
> > > +
> > > + printed += fprintf(fp, "\n\n");
> > > + return printed;
> > > +}
> > > +
> > > +int trace_print_bpf_summary(FILE *fp)
> > > +{
> > > + struct bpf_map *map = skel->maps.syscall_stats_map;
> > > + struct syscall_key *prev_key, key;
> > > + struct syscall_data **data = NULL;
> > > + struct hashmap schash;
> > > + struct hashmap_entry *entry;
> > > + int nr_data = 0;
> > > + int printed = 0;
> > > + int i;
> > > + size_t bkt;
> > > +
> > > + hashmap__init(&schash, sc_node_hash, sc_node_equal, /*ctx=*/NULL);
> > > +
> > > + printed = fprintf(fp, "\n Summary of events:\n\n");
> > > +
> > > + /* get stats from the bpf map */
> > > + prev_key = NULL;
> > > + while (!bpf_map__get_next_key(map, prev_key, &key, sizeof(key))) {
> > > + struct syscall_stats stat;
> > > +
> > > + if (!bpf_map__lookup_elem(map, &key, sizeof(key), &stat, sizeof(stat), 0)) {
> > > + if (skel->rodata->aggr_mode == SYSCALL_AGGR_THREAD)
> > > + update_thread_stats(&schash, &key, &stat);
> > > + else
> > > + update_total_stats(&schash, &key, &stat);
> > > + }
> > > +
> > > + prev_key = &key;
> > > + }
> > > +
> > > + nr_data = hashmap__size(&schash);
> > > + data = calloc(nr_data, sizeof(*data));
> > > + if (data == NULL)
> > > + goto out;
> > > +
> > > + i = 0;
> > > + hashmap__for_each_entry(&schash, entry, bkt)
> > > + data[i++] = entry->pvalue;
> > > +
> > > + qsort(data, nr_data, sizeof(*data), datacmp);
> >
> > Here syscall_data is sorted for AGGR_THREAD and AGGR_CPU, meaning the
> > thread who has the higher total syscall period will be printed first.
> > This is an awesome side effect but it is not the behavior of 'sudo
> > ./perf trace -as -- sleep 1' without the --bpf-summary option. If it
> > is not too trivial, maybe consider documenting this behavior? But it
> > may be too verbose so Idk.
>
> The original behavior is random ordering and now it's ordered by total
> time. But still we can think it's randomly ordered. I believe we need
> to maintain strict ordering by total time and then fix the existing
> code. Once that happens I can add the documentation.
ofc.
>
> Thanks for your careful review!
You are welcome.
> Namhyung
Thanks,
Howard
Hello Namhyung,
Can you please rebase it? I cannot apply it, getting:
perf $ git apply --reject --whitespace=fix
./v2_20250317_namhyung_perf_trace_implement_syscall_summary_in_bpf.mbx
Checking patch tools/perf/Documentation/perf-trace.txt...
Checking patch tools/perf/Makefile.perf...
Hunk #1 succeeded at 1198 (offset -8 lines).
Checking patch tools/perf/builtin-trace.c...
error: while searching for:
bool hexret;
};
enum summary_mode {
SUMMARY__NONE = 0,
SUMMARY__BY_TOTAL,
SUMMARY__BY_THREAD,
};
struct trace {
struct perf_tool tool;
struct {
error: patch failed: tools/perf/builtin-trace.c:140
Sorry.
Thanks,
Howard
Hello Howard,
On Wed, Mar 19, 2025 at 12:00:10PM -0700, Howard Chu wrote:
> Hello Namhyung,
>
> Can you please rebase it? I cannot apply it, getting:
>
> perf $ git apply --reject --whitespace=fix
> ./v2_20250317_namhyung_perf_trace_implement_syscall_summary_in_bpf.mbx
> Checking patch tools/perf/Documentation/perf-trace.txt...
> Checking patch tools/perf/Makefile.perf...
> Hunk #1 succeeded at 1198 (offset -8 lines).
> Checking patch tools/perf/builtin-trace.c...
> error: while searching for:
> bool hexret;
> };
>
> enum summary_mode {
> SUMMARY__NONE = 0,
> SUMMARY__BY_TOTAL,
> SUMMARY__BY_THREAD,
> };
>
> struct trace {
> struct perf_tool tool;
> struct {
>
> error: patch failed: tools/perf/builtin-trace.c:140
Oops, I think I forgot to say it's on top of Ian's change.
Please try this first. Sorry for the confusion.
https://lore.kernel.org/r/20250319050741.269828-1-irogers@google.com
Thanks,
Namhyung
Hi Namhyung,
On Wed, Mar 19, 2025 at 3:07 PM Namhyung Kim <namhyung@kernel.org> wrote:
>
> Hello Howard,
>
> On Wed, Mar 19, 2025 at 12:00:10PM -0700, Howard Chu wrote:
> > Hello Namhyung,
> >
> > Can you please rebase it? I cannot apply it, getting:
> >
> > perf $ git apply --reject --whitespace=fix
> > ./v2_20250317_namhyung_perf_trace_implement_syscall_summary_in_bpf.mbx
> > Checking patch tools/perf/Documentation/perf-trace.txt...
> > Checking patch tools/perf/Makefile.perf...
> > Hunk #1 succeeded at 1198 (offset -8 lines).
> > Checking patch tools/perf/builtin-trace.c...
> > error: while searching for:
> > bool hexret;
> > };
> >
> > enum summary_mode {
> > SUMMARY__NONE = 0,
> > SUMMARY__BY_TOTAL,
> > SUMMARY__BY_THREAD,
> > };
> >
> > struct trace {
> > struct perf_tool tool;
> > struct {
> >
> > error: patch failed: tools/perf/builtin-trace.c:140
>
> Oops, I think I forgot to say it's on top of Ian's change.
> Please try this first. Sorry for the confusion.
>
> https://lore.kernel.org/r/20250319050741.269828-1-irogers@google.com
Yep, with Ian's patches it successfully applied. :)
Thanks,
Howard
>
> Thanks,
> Namhyung
>
Hi Namhyung,
I haven't finished the code review yet, but here are some things that I
found interesting to share:
## 1
You used sudo ./perf trace -as --bpf-summary --summary-mode=total -- sleep 1 as
an example
If I use perf trace --bpf-summary without the '-a', that is to record
the process / task of 'sleep 1':
sudo ./perf trace -s --bpf-summary --summary-mode=total -- sleep 1
It won't be recording this one process. So there should be a sentence
saying that bpf-summary only does system wide summaries.
## 2
there is a bug in min section, which made min always 0
you can see it in the sample output you provided above:
syscall calls errors total min avg
max stddev
(msec) (msec) (msec)
(msec) (%)
--------------- -------- ------ -------- --------- ---------
--------- ------
futex 372 18 4373.773 0.000 11.757
997.715 660.42%
poll 241 0 2757.963 0.000 11.444
997.758 580.34%
epoll_wait 161 0 2460.854 0.000 15.285
325.189 260.73%
ppoll 19 0 1298.652 0.000 68.350
667.172 281.46%
clock_nanosleep 1 0 1000.093 0.000 1000.093
1000.093 0.00%
epoll_pwait 16 0 192.787 0.000 12.049
173.994 348.73%
nanosleep 6 0 50.926 0.000 8.488
10.210 43.96%
clock_nanosleep has only 1 call so min can never be 0, it has to be
equal to the max and the mean.
This can be resolved by adding this line (same as what you did in the BPF code):
diff --git a/tools/perf/util/bpf-trace-summary.c
b/tools/perf/util/bpf-trace-summary.c
index 5ae9feca244d..eb98db7d6e33 100644
--- a/tools/perf/util/bpf-trace-summary.c
+++ b/tools/perf/util/bpf-trace-summary.c
@@ -243,7 +243,7 @@ static int update_total_stats(struct hashmap
*hash, struct syscall_key *map_key,
if (stat->max_time < map_data->max_time)
stat->max_time = map_data->max_time;
- if (stat->min_time > map_data->min_time)
+ if (stat->min_time > map_data->min_time || !stat->min_time)
stat->min_time = map_data->min_time;
return 0;
(sorry for the poor formatting from the gmail browser app)
## 3
Apologies for misunderstanding how the calculation of the 'standard
deviation of mean' works. You can decide what to do with it. :) Thanks
for the explanation in the thread of the previous version.
Thanks,
Howard
On Wed, Mar 19, 2025 at 3:19 PM Howard Chu <howardchu95@gmail.com> wrote:
>
> Hi Namhyung,
>
> On Wed, Mar 19, 2025 at 3:07 PM Namhyung Kim <namhyung@kernel.org> wrote:
> >
> > Hello Howard,
> >
> > On Wed, Mar 19, 2025 at 12:00:10PM -0700, Howard Chu wrote:
> > > Hello Namhyung,
> > >
> > > Can you please rebase it? I cannot apply it, getting:
> > >
> > > perf $ git apply --reject --whitespace=fix
> > > ./v2_20250317_namhyung_perf_trace_implement_syscall_summary_in_bpf.mbx
> > > Checking patch tools/perf/Documentation/perf-trace.txt...
> > > Checking patch tools/perf/Makefile.perf...
> > > Hunk #1 succeeded at 1198 (offset -8 lines).
> > > Checking patch tools/perf/builtin-trace.c...
> > > error: while searching for:
> > > bool hexret;
> > > };
> > >
> > > enum summary_mode {
> > > SUMMARY__NONE = 0,
> > > SUMMARY__BY_TOTAL,
> > > SUMMARY__BY_THREAD,
> > > };
> > >
> > > struct trace {
> > > struct perf_tool tool;
> > > struct {
> > >
> > > error: patch failed: tools/perf/builtin-trace.c:140
> >
> > Oops, I think I forgot to say it's on top of Ian's change.
> > Please try this first. Sorry for the confusion.
> >
> > https://lore.kernel.org/r/20250319050741.269828-1-irogers@google.com
>
> Yep, with Ian's patches it successfully applied. :)
>
> Thanks,
> Howard
> >
> > Thanks,
> > Namhyung
> >
On Wed, Mar 19, 2025 at 04:39:17PM -0700, Howard Chu wrote:
> Hi Namhyung,
>
> I haven't finished the code review yet, but here are some things that I
> found interesting to share:
Thanks a lot for your review!
>
> ## 1
> You used sudo ./perf trace -as --bpf-summary --summary-mode=total -- sleep 1 as
> an example
>
> If I use perf trace --bpf-summary without the '-a', that is to record
> the process / task of 'sleep 1':
>
> sudo ./perf trace -s --bpf-summary --summary-mode=total -- sleep 1
>
> It won't be recording this one process. So there should be a sentence
> saying that bpf-summary only does system wide summaries.
Hmm.. you're right. I added per-thread summary but it still works for
system-wide only. I'll check the target and make it fail with an error
message if it's not in system-wide mode for now. I think we can add
support for other targets later.
>
> ## 2
> there is a bug in min section, which made min always 0
>
> you can see it in the sample output you provided above:
> syscall calls errors total min avg
> max stddev
> (msec) (msec) (msec)
> (msec) (%)
> --------------- -------- ------ -------- --------- ---------
> --------- ------
> futex 372 18 4373.773 0.000 11.757
> 997.715 660.42%
> poll 241 0 2757.963 0.000 11.444
> 997.758 580.34%
> epoll_wait 161 0 2460.854 0.000 15.285
> 325.189 260.73%
> ppoll 19 0 1298.652 0.000 68.350
> 667.172 281.46%
> clock_nanosleep 1 0 1000.093 0.000 1000.093
> 1000.093 0.00%
> epoll_pwait 16 0 192.787 0.000 12.049
> 173.994 348.73%
> nanosleep 6 0 50.926 0.000 8.488
> 10.210 43.96%
>
> clock_nanosleep has only 1 call so min can never be 0, it has to be
> equal to the max and the mean.
Oops, right.
>
> This can be resolved by adding this line (same as what you did in the BPF code):
>
> diff --git a/tools/perf/util/bpf-trace-summary.c
> b/tools/perf/util/bpf-trace-summary.c
> index 5ae9feca244d..eb98db7d6e33 100644
> --- a/tools/perf/util/bpf-trace-summary.c
> +++ b/tools/perf/util/bpf-trace-summary.c
> @@ -243,7 +243,7 @@ static int update_total_stats(struct hashmap
> *hash, struct syscall_key *map_key,
>
> if (stat->max_time < map_data->max_time)
> stat->max_time = map_data->max_time;
> - if (stat->min_time > map_data->min_time)
> + if (stat->min_time > map_data->min_time || !stat->min_time)
> stat->min_time = map_data->min_time;
>
> return 0;
>
> (sorry for the poor formatting from the gmail browser app)
No problem, I'll add this change.
>
> ## 3
> Apologies for misunderstanding how the calculation of the 'standard
> deviation of mean' works. You can decide what to do with it. :) Thanks
> for the explanation in the thread of the previous version.
No, it turns out that it can be calculated easily with the squared sum.
So I've added it.
Thanks,
Namhyung
>
> Thanks,
> Howard
>
> On Wed, Mar 19, 2025 at 3:19 PM Howard Chu <howardchu95@gmail.com> wrote:
> >
> > Hi Namhyung,
> >
> > On Wed, Mar 19, 2025 at 3:07 PM Namhyung Kim <namhyung@kernel.org> wrote:
> > >
> > > Hello Howard,
> > >
> > > On Wed, Mar 19, 2025 at 12:00:10PM -0700, Howard Chu wrote:
> > > > Hello Namhyung,
> > > >
> > > > Can you please rebase it? I cannot apply it, getting:
> > > >
> > > > perf $ git apply --reject --whitespace=fix
> > > > ./v2_20250317_namhyung_perf_trace_implement_syscall_summary_in_bpf.mbx
> > > > Checking patch tools/perf/Documentation/perf-trace.txt...
> > > > Checking patch tools/perf/Makefile.perf...
> > > > Hunk #1 succeeded at 1198 (offset -8 lines).
> > > > Checking patch tools/perf/builtin-trace.c...
> > > > error: while searching for:
> > > > bool hexret;
> > > > };
> > > >
> > > > enum summary_mode {
> > > > SUMMARY__NONE = 0,
> > > > SUMMARY__BY_TOTAL,
> > > > SUMMARY__BY_THREAD,
> > > > };
> > > >
> > > > struct trace {
> > > > struct perf_tool tool;
> > > > struct {
> > > >
> > > > error: patch failed: tools/perf/builtin-trace.c:140
> > >
> > > Oops, I think I forgot to say it's on top of Ian's change.
> > > Please try this first. Sorry for the confusion.
> > >
> > > https://lore.kernel.org/r/20250319050741.269828-1-irogers@google.com
> >
> > Yep, with Ian's patches it successfully applied. :)
> >
> > Thanks,
> > Howard
> > >
> > > Thanks,
> > > Namhyung
> > >
On Mon, Mar 17, 2025 at 11:08 AM Namhyung Kim <namhyung@kernel.org> wrote:
>
> When -s/--summary option is used, it doesn't need (augmented) arguments
> of syscalls. Let's skip the augmentation and load another small BPF
> program to collect the statistics in the kernel instead of copying the
> data to the ring-buffer to calculate the stats in userspace. This will
> be much more light-weight than the existing approach and remove any lost
> events.
>
> Let's add a new option --bpf-summary to control this behavior. I cannot
> make it default because there's no way to get e_machine in the BPF which
> is needed for detecting different ABIs like 32-bit compat mode.
>
> No functional changes intended except for no more LOST events. :)
>
> $ sudo perf trace -as --bpf-summary --summary-mode=total -- sleep 1
>
> Summary of events:
>
> total, 2824 events
>
> syscall calls errors total min avg max stddev
> (msec) (msec) (msec) (msec) (%)
> --------------- -------- ------ -------- --------- --------- --------- ------
> futex 372 18 4373.773 0.000 11.757 997.715 660.42%
> poll 241 0 2757.963 0.000 11.444 997.758 580.34%
> epoll_wait 161 0 2460.854 0.000 15.285 325.189 260.73%
> ppoll 19 0 1298.652 0.000 68.350 667.172 281.46%
> clock_nanosleep 1 0 1000.093 0.000 1000.093 1000.093 0.00%
> epoll_pwait 16 0 192.787 0.000 12.049 173.994 348.73%
> nanosleep 6 0 50.926 0.000 8.488 10.210 43.96%
> ...
>
> Cc: Howard Chu <howardchu95@gmail.com>
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> ---
> v2)
> * rebased on top of Ian's e_machine changes
> * add --bpf-summary option
> * support per-thread summary
> * add stddev calculation (Howard)
>
> tools/perf/Documentation/perf-trace.txt | 6 +
> tools/perf/Makefile.perf | 2 +-
> tools/perf/builtin-trace.c | 43 ++-
> tools/perf/util/Build | 1 +
> tools/perf/util/bpf-trace-summary.c | 334 ++++++++++++++++++
> .../perf/util/bpf_skel/syscall_summary.bpf.c | 118 +++++++
> tools/perf/util/bpf_skel/syscall_summary.h | 25 ++
> tools/perf/util/trace.h | 37 ++
> 8 files changed, 553 insertions(+), 13 deletions(-)
> create mode 100644 tools/perf/util/bpf-trace-summary.c
> create mode 100644 tools/perf/util/bpf_skel/syscall_summary.bpf.c
> create mode 100644 tools/perf/util/bpf_skel/syscall_summary.h
> create mode 100644 tools/perf/util/trace.h
>
> diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
> index 887dc37773d0f4d6..a8a0d8c33438fef7 100644
> --- a/tools/perf/Documentation/perf-trace.txt
> +++ b/tools/perf/Documentation/perf-trace.txt
> @@ -251,6 +251,12 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
> pretty-printing serves as a fallback to hand-crafted pretty printers, as the latter can
> better pretty-print integer flags and struct pointers.
>
> +--bpf-summary::
> + Collect system call statistics in BPF. This is only for live mode and
> + works well with -s/--summary option where no argument information is
> + required.
> +
> +
> PAGEFAULTS
> ----------
>
> diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
> index f3cd8de15d1a2681..d7a7e0c68fc10b8b 100644
> --- a/tools/perf/Makefile.perf
> +++ b/tools/perf/Makefile.perf
> @@ -1206,7 +1206,7 @@ SKELETONS += $(SKEL_OUT)/bperf_leader.skel.h $(SKEL_OUT)/bperf_follower.skel.h
> SKELETONS += $(SKEL_OUT)/bperf_cgroup.skel.h $(SKEL_OUT)/func_latency.skel.h
> SKELETONS += $(SKEL_OUT)/off_cpu.skel.h $(SKEL_OUT)/lock_contention.skel.h
> SKELETONS += $(SKEL_OUT)/kwork_trace.skel.h $(SKEL_OUT)/sample_filter.skel.h
> -SKELETONS += $(SKEL_OUT)/kwork_top.skel.h
> +SKELETONS += $(SKEL_OUT)/kwork_top.skel.h $(SKEL_OUT)/syscall_summary.skel.h
> SKELETONS += $(SKEL_OUT)/bench_uprobe.skel.h
> SKELETONS += $(SKEL_OUT)/augmented_raw_syscalls.skel.h
>
> diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
> index 7fef59306db2891f..deeb7250e8c52354 100644
> --- a/tools/perf/builtin-trace.c
> +++ b/tools/perf/builtin-trace.c
> @@ -55,6 +55,7 @@
> #include "util/thread_map.h"
> #include "util/stat.h"
> #include "util/tool.h"
> +#include "util/trace.h"
> #include "util/util.h"
> #include "trace/beauty/beauty.h"
> #include "trace-event.h"
> @@ -141,12 +142,6 @@ struct syscall_fmt {
> bool hexret;
> };
>
> -enum summary_mode {
> - SUMMARY__NONE = 0,
> - SUMMARY__BY_TOTAL,
> - SUMMARY__BY_THREAD,
> -};
> -
> struct trace {
> struct perf_tool tool;
> struct {
> @@ -205,7 +200,7 @@ struct trace {
> } stats;
> unsigned int max_stack;
> unsigned int min_stack;
> - enum summary_mode summary_mode;
> + enum trace_summary_mode summary_mode;
> int raw_augmented_syscalls_args_size;
> bool raw_augmented_syscalls;
> bool fd_path_disabled;
> @@ -234,6 +229,7 @@ struct trace {
> bool force;
> bool vfs_getname;
> bool force_btf;
> + bool summary_bpf;
> int trace_pgfaults;
> char *perfconfig_events;
> struct {
> @@ -4356,6 +4352,13 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
>
> trace->live = true;
>
> + if (trace->summary_bpf) {
> + if (trace_prepare_bpf_summary(trace->summary_mode) < 0)
> + goto out_delete_evlist;
> +
> + goto create_maps;
> + }
> +
> if (!trace->raw_augmented_syscalls) {
> if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
> goto out_error_raw_syscalls;
> @@ -4414,6 +4417,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
> if (trace->cgroup)
> evlist__set_default_cgroup(trace->evlist, trace->cgroup);
>
> +create_maps:
> err = evlist__create_maps(evlist, &trace->opts.target);
> if (err < 0) {
> fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
> @@ -4426,7 +4430,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
> goto out_delete_evlist;
> }
>
> - if (trace->summary_mode == SUMMARY__BY_TOTAL) {
> + if (trace->summary_mode == SUMMARY__BY_TOTAL && !trace->summary_bpf) {
> trace->syscall_stats = alloc_syscall_stats();
> if (trace->syscall_stats == NULL)
> goto out_delete_evlist;
> @@ -4512,9 +4516,11 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
> if (err < 0)
> goto out_error_apply_filters;
>
> - err = evlist__mmap(evlist, trace->opts.mmap_pages);
> - if (err < 0)
> - goto out_error_mmap;
> + if (!trace->summary_bpf) {
> + err = evlist__mmap(evlist, trace->opts.mmap_pages);
> + if (err < 0)
> + goto out_error_mmap;
> + }
>
> if (!target__none(&trace->opts.target) && !trace->opts.target.initial_delay)
> evlist__enable(evlist);
> @@ -4527,6 +4533,9 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
> evlist__enable(evlist);
> }
>
> + if (trace->summary_bpf)
> + trace_start_bpf_summary();
> +
> trace->multiple_threads = perf_thread_map__pid(evlist->core.threads, 0) == -1 ||
> perf_thread_map__nr(evlist->core.threads) > 1 ||
> evlist__first(evlist)->core.attr.inherit;
> @@ -4594,12 +4603,17 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
>
> evlist__disable(evlist);
>
> + if (trace->summary_bpf)
> + trace_end_bpf_summary();
> +
> if (trace->sort_events)
> ordered_events__flush(&trace->oe.data, OE_FLUSH__FINAL);
>
> if (!err) {
> if (trace->summary) {
> - if (trace->summary_mode == SUMMARY__BY_TOTAL)
> + if (trace->summary_bpf)
> + trace_print_bpf_summary(trace->output);
> + else if (trace->summary_mode == SUMMARY__BY_TOTAL)
> trace__fprintf_total_summary(trace, trace->output);
> else
> trace__fprintf_thread_summary(trace, trace->output);
> @@ -4615,6 +4629,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
> }
>
> out_delete_evlist:
> + trace_cleanup_bpf_summary();
> delete_syscall_stats(trace->syscall_stats);
> trace__symbols__exit(trace);
> evlist__free_syscall_tp_fields(evlist);
> @@ -5444,6 +5459,7 @@ int cmd_trace(int argc, const char **argv)
> "start"),
> OPT_BOOLEAN(0, "force-btf", &trace.force_btf, "Prefer btf_dump general pretty printer"
> "to customized ones"),
> + OPT_BOOLEAN(0, "bpf-summary", &trace.summary_bpf, "Summary syscall stats in BPF"),
> OPTS_EVSWITCH(&trace.evswitch),
> OPT_END()
> };
> @@ -5535,6 +5551,9 @@ int cmd_trace(int argc, const char **argv)
> goto skip_augmentation;
> }
>
> + if (trace.summary_only && trace.summary_bpf)
> + goto skip_augmentation;
> +
> trace.skel = augmented_raw_syscalls_bpf__open();
> if (!trace.skel) {
> pr_debug("Failed to open augmented syscalls BPF skeleton");
> diff --git a/tools/perf/util/Build b/tools/perf/util/Build
> index 034a6603d5a8e8b0..ba4201a6f3c69753 100644
> --- a/tools/perf/util/Build
> +++ b/tools/perf/util/Build
> @@ -171,6 +171,7 @@ perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf_off_cpu.o
> perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf-filter.o
> perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf-filter-flex.o
> perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf-filter-bison.o
> +perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf-trace-summary.o
> perf-util-$(CONFIG_PERF_BPF_SKEL) += btf.o
>
> ifeq ($(CONFIG_LIBTRACEEVENT),y)
> diff --git a/tools/perf/util/bpf-trace-summary.c b/tools/perf/util/bpf-trace-summary.c
> new file mode 100644
> index 0000000000000000..5ae9feca244d5b22
> --- /dev/null
> +++ b/tools/perf/util/bpf-trace-summary.c
> @@ -0,0 +1,334 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#include <inttypes.h>
> +#include <math.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +
> +#include "dwarf-regs.h" /* for EM_HOST */
> +#include "syscalltbl.h"
> +#include "util/hashmap.h"
> +#include "util/trace.h"
> +#include "util/util.h"
> +#include <bpf/bpf.h>
> +#include <linux/time64.h>
> +#include <tools/libc_compat.h> /* reallocarray */
> +
> +#include "bpf_skel/syscall_summary.h"
> +#include "bpf_skel/syscall_summary.skel.h"
> +
> +
> +static struct syscall_summary_bpf *skel;
> +
> +int trace_prepare_bpf_summary(enum trace_summary_mode mode)
> +{
> + skel = syscall_summary_bpf__open();
> + if (skel == NULL) {
> + fprintf(stderr, "failed to open syscall summary bpf skeleton\n");
> + return -1;
> + }
> +
> + if (mode == SUMMARY__BY_THREAD)
> + skel->rodata->aggr_mode = SYSCALL_AGGR_THREAD;
> + else
> + skel->rodata->aggr_mode = SYSCALL_AGGR_CPU;
> +
> + if (syscall_summary_bpf__load(skel) < 0) {
> + fprintf(stderr, "failed to load syscall summary bpf skeleton\n");
> + return -1;
> + }
> +
> + if (syscall_summary_bpf__attach(skel) < 0) {
> + fprintf(stderr, "failed to attach syscall summary bpf skeleton\n");
> + return -1;
> + }
> +
> + return 0;
> +}
> +
> +void trace_start_bpf_summary(void)
> +{
> + skel->bss->enabled = 1;
> +}
> +
> +void trace_end_bpf_summary(void)
> +{
> + skel->bss->enabled = 0;
> +}
> +
> +struct syscall_node {
> + int syscall_nr;
> + struct syscall_stats stats;
> +};
> +
> +static double rel_stddev(struct syscall_stats *stat)
> +{
> + double variance, average;
> +
> + if (stat->count < 2)
> + return 0;
> +
> + average = (double)stat->total_time / stat->count;
> +
> + variance = stat->squared_sum;
> + variance -= (stat->total_time * stat->total_time) / stat->count;
> + variance /= stat->count;
> +
> + return 100 * sqrt(variance) / average;
> +}
> +
> +struct syscall_data {
> + int key; /* tid if AGGR_THREAD, syscall-nr if AGGR_CPU */
> + int nr_events;
> + int nr_nodes;
> + u64 total_time;
> + struct syscall_node *nodes;
> +};
> +
> +static int datacmp(const void *a, const void *b)
> +{
> + const struct syscall_data * const *sa = a;
> + const struct syscall_data * const *sb = b;
> +
> + return (*sa)->total_time > (*sb)->total_time ? -1 : 1;
> +}
> +
> +static int nodecmp(const void *a, const void *b)
> +{
> + const struct syscall_node *na = a;
> + const struct syscall_node *nb = b;
> +
> + return na->stats.total_time > nb->stats.total_time ? -1 : 1;
> +}
> +
> +static size_t sc_node_hash(long key, void *ctx __maybe_unused)
> +{
> + return key;
> +}
> +
> +static bool sc_node_equal(long key1, long key2, void *ctx __maybe_unused)
> +{
> + return key1 == key2;
> +}
> +
> +static int print_common_stats(struct syscall_data *data, FILE *fp)
> +{
> + int printed = 0;
> +
> + for (int i = 0; i < data->nr_nodes; i++) {
> + struct syscall_node *node = &data->nodes[i];
> + struct syscall_stats *stat = &node->stats;
> + double total = (double)(stat->total_time) / NSEC_PER_MSEC;
> + double min = (double)(stat->min_time) / NSEC_PER_MSEC;
> + double max = (double)(stat->max_time) / NSEC_PER_MSEC;
> + double avg = total / stat->count;
> + const char *name;
> +
> + /* TODO: support other ABIs */
> + name = syscalltbl__name(EM_HOST, node->syscall_nr);
> + if (name)
> + printed += fprintf(fp, " %-15s", name);
> + else
> + printed += fprintf(fp, " syscall:%-7d", node->syscall_nr);
> +
> + printed += fprintf(fp, " %8u %6u %9.3f %9.3f %9.3f %9.3f %9.2f%%\n",
> + stat->count, stat->error, total, min, avg, max,
> + rel_stddev(stat));
> + }
> + return printed;
> +}
> +
> +static int update_thread_stats(struct hashmap *hash, struct syscall_key *map_key,
> + struct syscall_stats *map_data)
> +{
> + struct syscall_data *data;
> + struct syscall_node *nodes;
> +
> + if (!hashmap__find(hash, map_key->cpu_or_tid, &data)) {
> + data = zalloc(sizeof(*data));
> + if (data == NULL)
> + return -ENOMEM;
> +
> + data->key = map_key->cpu_or_tid;
> + if (hashmap__add(hash, data->key, data) < 0) {
> + free(data);
> + return -ENOMEM;
> + }
> + }
> +
> + /* update thread total stats */
> + data->nr_events += map_data->count;
> + data->total_time += map_data->total_time;
> +
> + nodes = reallocarray(data->nodes, data->nr_nodes + 1, sizeof(*nodes));
> + if (nodes == NULL)
> + return -ENOMEM;
> +
> + data->nodes = nodes;
> + nodes = &data->nodes[data->nr_nodes++];
> + nodes->syscall_nr = map_key->nr;
> +
> + /* each thread has an entry for each syscall, just use the stat */
> + memcpy(&nodes->stats, map_data, sizeof(*map_data));
> + return 0;
> +}
> +
> +static int print_thread_stat(struct syscall_data *data, FILE *fp)
> +{
> + int printed = 0;
> +
> + qsort(data->nodes, data->nr_nodes, sizeof(*data->nodes), nodecmp);
> +
> + printed += fprintf(fp, " thread (%d), ", data->key);
> + printed += fprintf(fp, "%d events\n\n", data->nr_events);
> +
> + printed += fprintf(fp, " syscall calls errors total min avg max stddev\n");
> + printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n");
> + printed += fprintf(fp, " --------------- -------- ------ -------- --------- --------- --------- ------\n");
The code lgtm and follows the code base's conventions but it seems odd
to put very specific output like this inside of util/ rather than
builtin-trace.c. stat-display.c is similar, it just seems the boundary
between util/ and the buitin-*.c files is blurry.
Thanks,
Ian
> +
> + printed += print_common_stats(data, fp);
> + printed += fprintf(fp, "\n\n");
> +
> + return printed;
> +}
> +
> +static int print_thread_stats(struct syscall_data **data, int nr_data, FILE *fp)
> +{
> + int printed = 0;
> +
> + for (int i = 0; i < nr_data; i++)
> + printed += print_thread_stat(data[i], fp);
> +
> + return printed;
> +}
> +
> +static int update_total_stats(struct hashmap *hash, struct syscall_key *map_key,
> + struct syscall_stats *map_data)
> +{
> + struct syscall_data *data;
> + struct syscall_stats *stat;
> +
> + if (!hashmap__find(hash, map_key, &data)) {
> + data = zalloc(sizeof(*data));
> + if (data == NULL)
> + return -ENOMEM;
> +
> + data->nodes = zalloc(sizeof(*data->nodes));
> + if (data->nodes == NULL) {
> + free(data);
> + return -ENOMEM;
> + }
> +
> + data->nr_nodes = 1;
> + data->key = map_key->nr;
> + data->nodes->syscall_nr = data->key;
> +
> + if (hashmap__add(hash, data->key, data) < 0) {
> + free(data->nodes);
> + free(data);
> + return -ENOMEM;
> + }
> + }
> +
> + /* update total stats for this syscall */
> + data->nr_events += map_data->count;
> + data->total_time += map_data->total_time;
> +
> + /* This is sum of the same syscall from different CPUs */
> + stat = &data->nodes->stats;
> +
> + stat->total_time += map_data->total_time;
> + stat->squared_sum += map_data->squared_sum;
> + stat->count += map_data->count;
> + stat->error += map_data->error;
> +
> + if (stat->max_time < map_data->max_time)
> + stat->max_time = map_data->max_time;
> + if (stat->min_time > map_data->min_time)
> + stat->min_time = map_data->min_time;
> +
> + return 0;
> +}
> +
> +static int print_total_stats(struct syscall_data **data, int nr_data, FILE *fp)
> +{
> + int printed = 0;
> + int nr_events = 0;
> +
> + for (int i = 0; i < nr_data; i++)
> + nr_events += data[i]->nr_events;
> +
> + printed += fprintf(fp, " total, %d events\n\n", nr_events);
> +
> + printed += fprintf(fp, " syscall calls errors total min avg max stddev\n");
> + printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n");
> + printed += fprintf(fp, " --------------- -------- ------ -------- --------- --------- --------- ------\n");
> +
> + for (int i = 0; i < nr_data; i++)
> + printed += print_common_stats(data[i], fp);
> +
> + printed += fprintf(fp, "\n\n");
> + return printed;
> +}
> +
> +int trace_print_bpf_summary(FILE *fp)
> +{
> + struct bpf_map *map = skel->maps.syscall_stats_map;
> + struct syscall_key *prev_key, key;
> + struct syscall_data **data = NULL;
> + struct hashmap schash;
> + struct hashmap_entry *entry;
> + int nr_data = 0;
> + int printed = 0;
> + int i;
> + size_t bkt;
> +
> + hashmap__init(&schash, sc_node_hash, sc_node_equal, /*ctx=*/NULL);
> +
> + printed = fprintf(fp, "\n Summary of events:\n\n");
> +
> + /* get stats from the bpf map */
> + prev_key = NULL;
> + while (!bpf_map__get_next_key(map, prev_key, &key, sizeof(key))) {
> + struct syscall_stats stat;
> +
> + if (!bpf_map__lookup_elem(map, &key, sizeof(key), &stat, sizeof(stat), 0)) {
> + if (skel->rodata->aggr_mode == SYSCALL_AGGR_THREAD)
> + update_thread_stats(&schash, &key, &stat);
> + else
> + update_total_stats(&schash, &key, &stat);
> + }
> +
> + prev_key = &key;
> + }
> +
> + nr_data = hashmap__size(&schash);
> + data = calloc(nr_data, sizeof(*data));
> + if (data == NULL)
> + goto out;
> +
> + i = 0;
> + hashmap__for_each_entry(&schash, entry, bkt)
> + data[i++] = entry->pvalue;
> +
> + qsort(data, nr_data, sizeof(*data), datacmp);
> +
> + if (skel->rodata->aggr_mode == SYSCALL_AGGR_THREAD)
> + printed += print_thread_stats(data, nr_data, fp);
> + else
> + printed += print_total_stats(data, nr_data, fp);
> +
> + for (i = 0; i < nr_data && data; i++) {
> + free(data[i]->nodes);
> + free(data[i]);
> + }
> + free(data);
> +
> +out:
> + hashmap__clear(&schash);
> + return printed;
> +}
> +
> +void trace_cleanup_bpf_summary(void)
> +{
> + syscall_summary_bpf__destroy(skel);
> +}
> diff --git a/tools/perf/util/bpf_skel/syscall_summary.bpf.c b/tools/perf/util/bpf_skel/syscall_summary.bpf.c
> new file mode 100644
> index 0000000000000000..b25f53b3c1351392
> --- /dev/null
> +++ b/tools/perf/util/bpf_skel/syscall_summary.bpf.c
> @@ -0,0 +1,118 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Trace raw_syscalls tracepoints to collect system call statistics.
> + */
> +
> +#include "vmlinux.h"
> +#include "syscall_summary.h"
> +
> +#include <bpf/bpf_helpers.h>
> +#include <bpf/bpf_tracing.h>
> +
> +/* This is to calculate a delta between sys-enter and sys-exit for each thread */
> +struct syscall_trace {
> + int nr; /* syscall number is only available at sys-enter */
> + int unused;
> + u64 timestamp;
> +};
> +
> +#define MAX_ENTRIES (128 * 1024)
> +
> +struct syscall_trace_map {
> + __uint(type, BPF_MAP_TYPE_HASH);
> + __type(key, int); /* tid */
> + __type(value, struct syscall_trace);
> + __uint(max_entries, MAX_ENTRIES);
> +} syscall_trace_map SEC(".maps");
> +
> +struct syscall_stats_map {
> + __uint(type, BPF_MAP_TYPE_HASH);
> + __type(key, struct syscall_key);
> + __type(value, struct syscall_stats);
> + __uint(max_entries, MAX_ENTRIES);
> +} syscall_stats_map SEC(".maps");
> +
> +int enabled; /* controlled from userspace */
> +
> +const volatile enum syscall_aggr_mode aggr_mode;
> +
> +static void update_stats(int cpu_or_tid, int nr, s64 duration, long ret)
> +{
> + struct syscall_key key = { .cpu_or_tid = cpu_or_tid, .nr = nr, };
> + struct syscall_stats *stats;
> +
> + stats = bpf_map_lookup_elem(&syscall_stats_map, &key);
> + if (stats == NULL) {
> + struct syscall_stats zero = {};
> +
> + bpf_map_update_elem(&syscall_stats_map, &key, &zero, BPF_NOEXIST);
> + stats = bpf_map_lookup_elem(&syscall_stats_map, &key);
> + if (stats == NULL)
> + return;
> + }
> +
> + __sync_fetch_and_add(&stats->count, 1);
> + if (ret < 0)
> + __sync_fetch_and_add(&stats->error, 1);
> +
> + if (duration > 0) {
> + __sync_fetch_and_add(&stats->total_time, duration);
> + __sync_fetch_and_add(&stats->squared_sum, duration * duration);
> + if (stats->max_time < duration)
> + stats->max_time = duration;
> + if (stats->min_time > duration || stats->min_time == 0)
> + stats->min_time = duration;
> + }
> +
> + return;
> +}
> +
> +SEC("tp_btf/sys_enter")
> +int sys_enter(u64 *ctx)
> +{
> + int tid;
> + struct syscall_trace st;
> +
> + if (!enabled)
> + return 0;
> +
> + st.nr = ctx[1]; /* syscall number */
> + st.unused = 0;
> + st.timestamp = bpf_ktime_get_ns();
> +
> + tid = bpf_get_current_pid_tgid();
> + bpf_map_update_elem(&syscall_trace_map, &tid, &st, BPF_ANY);
> +
> + return 0;
> +}
> +
> +SEC("tp_btf/sys_exit")
> +int sys_exit(u64 *ctx)
> +{
> + int tid;
> + int key;
> + long ret = ctx[1]; /* return value of the syscall */
> + struct syscall_trace *st;
> + s64 delta;
> +
> + if (!enabled)
> + return 0;
> +
> + tid = bpf_get_current_pid_tgid();
> + st = bpf_map_lookup_elem(&syscall_trace_map, &tid);
> + if (st == NULL)
> + return 0;
> +
> + if (aggr_mode == SYSCALL_AGGR_THREAD)
> + key = tid;
> + else
> + key = bpf_get_smp_processor_id();
> +
> + delta = bpf_ktime_get_ns() - st->timestamp;
> + update_stats(key, st->nr, delta, ret);
> +
> + bpf_map_delete_elem(&syscall_trace_map, &tid);
> + return 0;
> +}
> +
> +char _license[] SEC("license") = "GPL";
> diff --git a/tools/perf/util/bpf_skel/syscall_summary.h b/tools/perf/util/bpf_skel/syscall_summary.h
> new file mode 100644
> index 0000000000000000..17f9ecba657088aa
> --- /dev/null
> +++ b/tools/perf/util/bpf_skel/syscall_summary.h
> @@ -0,0 +1,25 @@
> +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
> +/* Data structures shared between BPF and tools. */
> +#ifndef UTIL_BPF_SKEL_SYSCALL_SUMMARY_H
> +#define UTIL_BPF_SKEL_SYSCALL_SUMMARY_H
> +
> +enum syscall_aggr_mode {
> + SYSCALL_AGGR_THREAD,
> + SYSCALL_AGGR_CPU,
> +};
> +
> +struct syscall_key {
> + int cpu_or_tid;
> + int nr;
> +};
> +
> +struct syscall_stats {
> + u64 total_time;
> + u64 squared_sum;
> + u64 max_time;
> + u64 min_time;
> + u32 count;
> + u32 error;
> +};
> +
> +#endif /* UTIL_BPF_SKEL_SYSCALL_SUMMARY_H */
> diff --git a/tools/perf/util/trace.h b/tools/perf/util/trace.h
> new file mode 100644
> index 0000000000000000..ef8361ed12c4edc1
> --- /dev/null
> +++ b/tools/perf/util/trace.h
> @@ -0,0 +1,37 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef UTIL_TRACE_H
> +#define UTIL_TRACE_H
> +
> +#include <stdio.h> /* for FILE */
> +
> +enum trace_summary_mode {
> + SUMMARY__NONE = 0,
> + SUMMARY__BY_TOTAL,
> + SUMMARY__BY_THREAD,
> +};
> +
> +#ifdef HAVE_BPF_SKEL
> +
> +int trace_prepare_bpf_summary(enum trace_summary_mode mode);
> +void trace_start_bpf_summary(void);
> +void trace_end_bpf_summary(void);
> +int trace_print_bpf_summary(FILE *fp);
> +void trace_cleanup_bpf_summary(void);
> +
> +#else /* !HAVE_BPF_SKEL */
> +
> +static inline int trace_prepare_bpf_summary(enum trace_summary_mode mode __maybe_unused)
> +{
> + return -1;
> +}
> +static inline void trace_start_bpf_summary(void) {}
> +static inline void trace_end_bpf_summary(void) {}
> +static inline int trace_print_bpf_summary(FILE *fp __maybe_unused)
> +{
> + return 0;
> +}
> +static inline void trace_cleanup_bpf_summary(void) {}
> +
> +#endif /* HAVE_BPF_SKEL */
> +
> +#endif /* UTIL_TRACE_H */
> --
> 2.49.0.rc1.451.g8f38331e32-goog
>
On Mon, Mar 17, 2025 at 11:37:22AM -0700, Ian Rogers wrote:
> On Mon, Mar 17, 2025 at 11:08 AM Namhyung Kim <namhyung@kernel.org> wrote:
> >
> > When -s/--summary option is used, it doesn't need (augmented) arguments
> > of syscalls. Let's skip the augmentation and load another small BPF
> > program to collect the statistics in the kernel instead of copying the
> > data to the ring-buffer to calculate the stats in userspace. This will
> > be much more light-weight than the existing approach and remove any lost
> > events.
> >
> > Let's add a new option --bpf-summary to control this behavior. I cannot
> > make it default because there's no way to get e_machine in the BPF which
> > is needed for detecting different ABIs like 32-bit compat mode.
> >
> > No functional changes intended except for no more LOST events. :)
> >
> > $ sudo perf trace -as --bpf-summary --summary-mode=total -- sleep 1
> >
> > Summary of events:
> >
> > total, 2824 events
> >
> > syscall calls errors total min avg max stddev
> > (msec) (msec) (msec) (msec) (%)
> > --------------- -------- ------ -------- --------- --------- --------- ------
> > futex 372 18 4373.773 0.000 11.757 997.715 660.42%
> > poll 241 0 2757.963 0.000 11.444 997.758 580.34%
> > epoll_wait 161 0 2460.854 0.000 15.285 325.189 260.73%
> > ppoll 19 0 1298.652 0.000 68.350 667.172 281.46%
> > clock_nanosleep 1 0 1000.093 0.000 1000.093 1000.093 0.00%
> > epoll_pwait 16 0 192.787 0.000 12.049 173.994 348.73%
> > nanosleep 6 0 50.926 0.000 8.488 10.210 43.96%
> > ...
> >
> > Cc: Howard Chu <howardchu95@gmail.com>
> > Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> > ---
> > v2)
> > * rebased on top of Ian's e_machine changes
> > * add --bpf-summary option
> > * support per-thread summary
> > * add stddev calculation (Howard)
> >
> > tools/perf/Documentation/perf-trace.txt | 6 +
> > tools/perf/Makefile.perf | 2 +-
> > tools/perf/builtin-trace.c | 43 ++-
> > tools/perf/util/Build | 1 +
> > tools/perf/util/bpf-trace-summary.c | 334 ++++++++++++++++++
> > .../perf/util/bpf_skel/syscall_summary.bpf.c | 118 +++++++
> > tools/perf/util/bpf_skel/syscall_summary.h | 25 ++
> > tools/perf/util/trace.h | 37 ++
> > 8 files changed, 553 insertions(+), 13 deletions(-)
> > create mode 100644 tools/perf/util/bpf-trace-summary.c
> > create mode 100644 tools/perf/util/bpf_skel/syscall_summary.bpf.c
> > create mode 100644 tools/perf/util/bpf_skel/syscall_summary.h
> > create mode 100644 tools/perf/util/trace.h
> >
> > diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
> > index 887dc37773d0f4d6..a8a0d8c33438fef7 100644
> > --- a/tools/perf/Documentation/perf-trace.txt
> > +++ b/tools/perf/Documentation/perf-trace.txt
> > @@ -251,6 +251,12 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
> > pretty-printing serves as a fallback to hand-crafted pretty printers, as the latter can
> > better pretty-print integer flags and struct pointers.
> >
> > +--bpf-summary::
> > + Collect system call statistics in BPF. This is only for live mode and
> > + works well with -s/--summary option where no argument information is
> > + required.
> > +
> > +
> > PAGEFAULTS
> > ----------
> >
> > diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
> > index f3cd8de15d1a2681..d7a7e0c68fc10b8b 100644
> > --- a/tools/perf/Makefile.perf
> > +++ b/tools/perf/Makefile.perf
> > @@ -1206,7 +1206,7 @@ SKELETONS += $(SKEL_OUT)/bperf_leader.skel.h $(SKEL_OUT)/bperf_follower.skel.h
> > SKELETONS += $(SKEL_OUT)/bperf_cgroup.skel.h $(SKEL_OUT)/func_latency.skel.h
> > SKELETONS += $(SKEL_OUT)/off_cpu.skel.h $(SKEL_OUT)/lock_contention.skel.h
> > SKELETONS += $(SKEL_OUT)/kwork_trace.skel.h $(SKEL_OUT)/sample_filter.skel.h
> > -SKELETONS += $(SKEL_OUT)/kwork_top.skel.h
> > +SKELETONS += $(SKEL_OUT)/kwork_top.skel.h $(SKEL_OUT)/syscall_summary.skel.h
> > SKELETONS += $(SKEL_OUT)/bench_uprobe.skel.h
> > SKELETONS += $(SKEL_OUT)/augmented_raw_syscalls.skel.h
> >
> > diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
> > index 7fef59306db2891f..deeb7250e8c52354 100644
> > --- a/tools/perf/builtin-trace.c
> > +++ b/tools/perf/builtin-trace.c
> > @@ -55,6 +55,7 @@
> > #include "util/thread_map.h"
> > #include "util/stat.h"
> > #include "util/tool.h"
> > +#include "util/trace.h"
> > #include "util/util.h"
> > #include "trace/beauty/beauty.h"
> > #include "trace-event.h"
> > @@ -141,12 +142,6 @@ struct syscall_fmt {
> > bool hexret;
> > };
> >
> > -enum summary_mode {
> > - SUMMARY__NONE = 0,
> > - SUMMARY__BY_TOTAL,
> > - SUMMARY__BY_THREAD,
> > -};
> > -
> > struct trace {
> > struct perf_tool tool;
> > struct {
> > @@ -205,7 +200,7 @@ struct trace {
> > } stats;
> > unsigned int max_stack;
> > unsigned int min_stack;
> > - enum summary_mode summary_mode;
> > + enum trace_summary_mode summary_mode;
> > int raw_augmented_syscalls_args_size;
> > bool raw_augmented_syscalls;
> > bool fd_path_disabled;
> > @@ -234,6 +229,7 @@ struct trace {
> > bool force;
> > bool vfs_getname;
> > bool force_btf;
> > + bool summary_bpf;
> > int trace_pgfaults;
> > char *perfconfig_events;
> > struct {
> > @@ -4356,6 +4352,13 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
> >
> > trace->live = true;
> >
> > + if (trace->summary_bpf) {
> > + if (trace_prepare_bpf_summary(trace->summary_mode) < 0)
> > + goto out_delete_evlist;
> > +
> > + goto create_maps;
> > + }
> > +
> > if (!trace->raw_augmented_syscalls) {
> > if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
> > goto out_error_raw_syscalls;
> > @@ -4414,6 +4417,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
> > if (trace->cgroup)
> > evlist__set_default_cgroup(trace->evlist, trace->cgroup);
> >
> > +create_maps:
> > err = evlist__create_maps(evlist, &trace->opts.target);
> > if (err < 0) {
> > fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
> > @@ -4426,7 +4430,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
> > goto out_delete_evlist;
> > }
> >
> > - if (trace->summary_mode == SUMMARY__BY_TOTAL) {
> > + if (trace->summary_mode == SUMMARY__BY_TOTAL && !trace->summary_bpf) {
> > trace->syscall_stats = alloc_syscall_stats();
> > if (trace->syscall_stats == NULL)
> > goto out_delete_evlist;
> > @@ -4512,9 +4516,11 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
> > if (err < 0)
> > goto out_error_apply_filters;
> >
> > - err = evlist__mmap(evlist, trace->opts.mmap_pages);
> > - if (err < 0)
> > - goto out_error_mmap;
> > + if (!trace->summary_bpf) {
> > + err = evlist__mmap(evlist, trace->opts.mmap_pages);
> > + if (err < 0)
> > + goto out_error_mmap;
> > + }
> >
> > if (!target__none(&trace->opts.target) && !trace->opts.target.initial_delay)
> > evlist__enable(evlist);
> > @@ -4527,6 +4533,9 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
> > evlist__enable(evlist);
> > }
> >
> > + if (trace->summary_bpf)
> > + trace_start_bpf_summary();
> > +
> > trace->multiple_threads = perf_thread_map__pid(evlist->core.threads, 0) == -1 ||
> > perf_thread_map__nr(evlist->core.threads) > 1 ||
> > evlist__first(evlist)->core.attr.inherit;
> > @@ -4594,12 +4603,17 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
> >
> > evlist__disable(evlist);
> >
> > + if (trace->summary_bpf)
> > + trace_end_bpf_summary();
> > +
> > if (trace->sort_events)
> > ordered_events__flush(&trace->oe.data, OE_FLUSH__FINAL);
> >
> > if (!err) {
> > if (trace->summary) {
> > - if (trace->summary_mode == SUMMARY__BY_TOTAL)
> > + if (trace->summary_bpf)
> > + trace_print_bpf_summary(trace->output);
> > + else if (trace->summary_mode == SUMMARY__BY_TOTAL)
> > trace__fprintf_total_summary(trace, trace->output);
> > else
> > trace__fprintf_thread_summary(trace, trace->output);
> > @@ -4615,6 +4629,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
> > }
> >
> > out_delete_evlist:
> > + trace_cleanup_bpf_summary();
> > delete_syscall_stats(trace->syscall_stats);
> > trace__symbols__exit(trace);
> > evlist__free_syscall_tp_fields(evlist);
> > @@ -5444,6 +5459,7 @@ int cmd_trace(int argc, const char **argv)
> > "start"),
> > OPT_BOOLEAN(0, "force-btf", &trace.force_btf, "Prefer btf_dump general pretty printer"
> > "to customized ones"),
> > + OPT_BOOLEAN(0, "bpf-summary", &trace.summary_bpf, "Summary syscall stats in BPF"),
> > OPTS_EVSWITCH(&trace.evswitch),
> > OPT_END()
> > };
> > @@ -5535,6 +5551,9 @@ int cmd_trace(int argc, const char **argv)
> > goto skip_augmentation;
> > }
> >
> > + if (trace.summary_only && trace.summary_bpf)
> > + goto skip_augmentation;
> > +
> > trace.skel = augmented_raw_syscalls_bpf__open();
> > if (!trace.skel) {
> > pr_debug("Failed to open augmented syscalls BPF skeleton");
> > diff --git a/tools/perf/util/Build b/tools/perf/util/Build
> > index 034a6603d5a8e8b0..ba4201a6f3c69753 100644
> > --- a/tools/perf/util/Build
> > +++ b/tools/perf/util/Build
> > @@ -171,6 +171,7 @@ perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf_off_cpu.o
> > perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf-filter.o
> > perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf-filter-flex.o
> > perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf-filter-bison.o
> > +perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf-trace-summary.o
> > perf-util-$(CONFIG_PERF_BPF_SKEL) += btf.o
> >
> > ifeq ($(CONFIG_LIBTRACEEVENT),y)
> > diff --git a/tools/perf/util/bpf-trace-summary.c b/tools/perf/util/bpf-trace-summary.c
> > new file mode 100644
> > index 0000000000000000..5ae9feca244d5b22
> > --- /dev/null
> > +++ b/tools/perf/util/bpf-trace-summary.c
> > @@ -0,0 +1,334 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +#include <inttypes.h>
> > +#include <math.h>
> > +#include <stdio.h>
> > +#include <stdlib.h>
> > +
> > +#include "dwarf-regs.h" /* for EM_HOST */
> > +#include "syscalltbl.h"
> > +#include "util/hashmap.h"
> > +#include "util/trace.h"
> > +#include "util/util.h"
> > +#include <bpf/bpf.h>
> > +#include <linux/time64.h>
> > +#include <tools/libc_compat.h> /* reallocarray */
> > +
> > +#include "bpf_skel/syscall_summary.h"
> > +#include "bpf_skel/syscall_summary.skel.h"
> > +
> > +
> > +static struct syscall_summary_bpf *skel;
> > +
> > +int trace_prepare_bpf_summary(enum trace_summary_mode mode)
> > +{
> > + skel = syscall_summary_bpf__open();
> > + if (skel == NULL) {
> > + fprintf(stderr, "failed to open syscall summary bpf skeleton\n");
> > + return -1;
> > + }
> > +
> > + if (mode == SUMMARY__BY_THREAD)
> > + skel->rodata->aggr_mode = SYSCALL_AGGR_THREAD;
> > + else
> > + skel->rodata->aggr_mode = SYSCALL_AGGR_CPU;
> > +
> > + if (syscall_summary_bpf__load(skel) < 0) {
> > + fprintf(stderr, "failed to load syscall summary bpf skeleton\n");
> > + return -1;
> > + }
> > +
> > + if (syscall_summary_bpf__attach(skel) < 0) {
> > + fprintf(stderr, "failed to attach syscall summary bpf skeleton\n");
> > + return -1;
> > + }
> > +
> > + return 0;
> > +}
> > +
> > +void trace_start_bpf_summary(void)
> > +{
> > + skel->bss->enabled = 1;
> > +}
> > +
> > +void trace_end_bpf_summary(void)
> > +{
> > + skel->bss->enabled = 0;
> > +}
> > +
> > +struct syscall_node {
> > + int syscall_nr;
> > + struct syscall_stats stats;
> > +};
> > +
> > +static double rel_stddev(struct syscall_stats *stat)
> > +{
> > + double variance, average;
> > +
> > + if (stat->count < 2)
> > + return 0;
> > +
> > + average = (double)stat->total_time / stat->count;
> > +
> > + variance = stat->squared_sum;
> > + variance -= (stat->total_time * stat->total_time) / stat->count;
> > + variance /= stat->count;
> > +
> > + return 100 * sqrt(variance) / average;
> > +}
> > +
> > +struct syscall_data {
> > + int key; /* tid if AGGR_THREAD, syscall-nr if AGGR_CPU */
> > + int nr_events;
> > + int nr_nodes;
> > + u64 total_time;
> > + struct syscall_node *nodes;
> > +};
> > +
> > +static int datacmp(const void *a, const void *b)
> > +{
> > + const struct syscall_data * const *sa = a;
> > + const struct syscall_data * const *sb = b;
> > +
> > + return (*sa)->total_time > (*sb)->total_time ? -1 : 1;
> > +}
> > +
> > +static int nodecmp(const void *a, const void *b)
> > +{
> > + const struct syscall_node *na = a;
> > + const struct syscall_node *nb = b;
> > +
> > + return na->stats.total_time > nb->stats.total_time ? -1 : 1;
> > +}
> > +
> > +static size_t sc_node_hash(long key, void *ctx __maybe_unused)
> > +{
> > + return key;
> > +}
> > +
> > +static bool sc_node_equal(long key1, long key2, void *ctx __maybe_unused)
> > +{
> > + return key1 == key2;
> > +}
> > +
> > +static int print_common_stats(struct syscall_data *data, FILE *fp)
> > +{
> > + int printed = 0;
> > +
> > + for (int i = 0; i < data->nr_nodes; i++) {
> > + struct syscall_node *node = &data->nodes[i];
> > + struct syscall_stats *stat = &node->stats;
> > + double total = (double)(stat->total_time) / NSEC_PER_MSEC;
> > + double min = (double)(stat->min_time) / NSEC_PER_MSEC;
> > + double max = (double)(stat->max_time) / NSEC_PER_MSEC;
> > + double avg = total / stat->count;
> > + const char *name;
> > +
> > + /* TODO: support other ABIs */
> > + name = syscalltbl__name(EM_HOST, node->syscall_nr);
> > + if (name)
> > + printed += fprintf(fp, " %-15s", name);
> > + else
> > + printed += fprintf(fp, " syscall:%-7d", node->syscall_nr);
> > +
> > + printed += fprintf(fp, " %8u %6u %9.3f %9.3f %9.3f %9.3f %9.2f%%\n",
> > + stat->count, stat->error, total, min, avg, max,
> > + rel_stddev(stat));
> > + }
> > + return printed;
> > +}
> > +
> > +static int update_thread_stats(struct hashmap *hash, struct syscall_key *map_key,
> > + struct syscall_stats *map_data)
> > +{
> > + struct syscall_data *data;
> > + struct syscall_node *nodes;
> > +
> > + if (!hashmap__find(hash, map_key->cpu_or_tid, &data)) {
> > + data = zalloc(sizeof(*data));
> > + if (data == NULL)
> > + return -ENOMEM;
> > +
> > + data->key = map_key->cpu_or_tid;
> > + if (hashmap__add(hash, data->key, data) < 0) {
> > + free(data);
> > + return -ENOMEM;
> > + }
> > + }
> > +
> > + /* update thread total stats */
> > + data->nr_events += map_data->count;
> > + data->total_time += map_data->total_time;
> > +
> > + nodes = reallocarray(data->nodes, data->nr_nodes + 1, sizeof(*nodes));
> > + if (nodes == NULL)
> > + return -ENOMEM;
> > +
> > + data->nodes = nodes;
> > + nodes = &data->nodes[data->nr_nodes++];
> > + nodes->syscall_nr = map_key->nr;
> > +
> > + /* each thread has an entry for each syscall, just use the stat */
> > + memcpy(&nodes->stats, map_data, sizeof(*map_data));
> > + return 0;
> > +}
> > +
> > +static int print_thread_stat(struct syscall_data *data, FILE *fp)
> > +{
> > + int printed = 0;
> > +
> > + qsort(data->nodes, data->nr_nodes, sizeof(*data->nodes), nodecmp);
> > +
> > + printed += fprintf(fp, " thread (%d), ", data->key);
> > + printed += fprintf(fp, "%d events\n\n", data->nr_events);
> > +
> > + printed += fprintf(fp, " syscall calls errors total min avg max stddev\n");
> > + printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n");
> > + printed += fprintf(fp, " --------------- -------- ------ -------- --------- --------- --------- ------\n");
>
> The code lgtm and follows the code base's conventions but it seems odd
> to put very specific output like this inside of util/ rather than
> builtin-trace.c. stat-display.c is similar, it just seems the boundary
> between util/ and the builtin-*.c files is blurry.
I see your point and totally agree. Ideally the print code should be
in builtin-trace.c and this file would contain only the BPF-related
parts. But that requires more refactoring to clearly separate out the
logic that can be shared in the util/ code. Currently this code is
large and it's hard to factor out some parts. I think we need to move
a lot of code to util/. Maybe I can work on that in the near future
when I add more changes to perf trace. But for now I'd like to keep
this change as small and independent as possible. Is that OK with you?
Thanks,
Namhyung
© 2016 - 2025 Red Hat, Inc.