Add a perf_event_array map for dumping direct off-cpu samples, while
keeping the at-the-end approach.

Bounds checks are added before every array access to satisfy the BPF
verifier.

If the off-cpu time (the delta) exceeds the off-cpu threshold, output a
sample directly.

Output PERF_SAMPLE_TID, PERF_SAMPLE_PERIOD, PERF_SAMPLE_CALLCHAIN, and
PERF_SAMPLE_CGROUP in bpf_perf_event_output().
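
For illustration, with all four sample types requested, off_cpu_dump()
fills the raw buffer with one u64 per slot, roughly as follows:

	/*
	 * data->array[] layout as emitted by off_cpu_dump():
	 *
	 *   [0]           PERF_SAMPLE_TID:       (u64)tgid << 32 | pid
	 *   [1]           PERF_SAMPLE_PERIOD:    off-cpu time (delta, ns)
	 *   [2]           PERF_SAMPLE_CALLCHAIN: nr = len + 1
	 *   [3]           PERF_CONTEXT_USER
	 *   [4 .. 3+len]  len user stack entries from the cached stack
	 *   [4+len]       PERF_SAMPLE_CGROUP:    cgroup id
	 *
	 * The size passed to bpf_perf_event_output() is (len + 5) * sizeof(u64).
	 */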
Ideally, we would only need to output PERF_SAMPLE_PERIOD (the off-cpu
time) and PERF_SAMPLE_CALLCHAIN (the sched_in task's callchain) from BPF;
for PERF_SAMPLE_TID and PERF_SAMPLE_CGROUP it should be enough to set the
sample type and let perf_event fill them in for us.

But in reality that's not the case. Setting PERF_SAMPLE_TID mostly gives
us a TID of 0; we only get the correct TID for the offcpu-time event on
rare occasions:
swapper 0 [000] offcpu-time: /
:1321819 1321819 [002] offcpu-time: /user.slice/user-1000.slice/session-2.scope
swapper 0 [001] offcpu-time: /
swapper 0 [003] offcpu-time: /

And setting PERF_SAMPLE_CGROUP doesn't work properly either:
tmux: server 3701 [003] offcpu-time: /
blueman-tray 1064 [001] offcpu-time: /
bash 1350867 [001] offcpu-time: /
bash 1350844 [000] offcpu-time: /

We therefore retrieve PERF_SAMPLE_TID, PERF_SAMPLE_PERIOD,
PERF_SAMPLE_CALLCHAIN and PERF_SAMPLE_CGROUP in BPF and output all four
fields ourselves.
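
The userspace side that consumes this payload is not part of this patch.
As a rough sketch of the intended parsing (the struct and function names
below are illustrative only, not from the perf tree), the raw data can be
unpacked back into the four embedded fields like this:

	#include <linux/types.h>
	#include <stddef.h>

	/* Illustrative only -- not actual perf code. */
	struct offcpu_embedded_sample {
		__u32 pid, tid;
		__u64 period;		/* off-cpu time in ns */
		__u64 nr;		/* callchain length, incl. PERF_CONTEXT_USER */
		const __u64 *ips;	/* points into the raw buffer */
		__u64 cgroup_id;
	};

	static int unpack_offcpu_raw(const __u64 *raw, size_t nr_slots,
				     struct offcpu_embedded_sample *s)
	{
		size_t i = 0;

		if (nr_slots < 4)
			return -1;

		s->pid = raw[i] >> 32;		/* upper 32 bits: key->tgid */
		s->tid = (__u32)raw[i++];	/* lower 32 bits: key->pid */
		s->period = raw[i++];		/* PERF_SAMPLE_PERIOD */

		s->nr = raw[i++];		/* PERF_SAMPLE_CALLCHAIN: nr entries follow */
		if (s->nr > nr_slots - i)
			return -1;
		s->ips = &raw[i];		/* [PERF_CONTEXT_USER, user ips ...] */
		i += s->nr;

		if (i >= nr_slots)
			return -1;
		s->cgroup_id = raw[i];		/* PERF_SAMPLE_CGROUP */

		return 0;
	}

This mirrors the layout written by off_cpu_dump(), so the off-cpu time can
be treated as the sample period and the cached stack as the callchain when
the sample is reconstructed.
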
Suggested-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Howard Chu <howardchu95@gmail.com>
---
tools/perf/util/bpf_skel/off_cpu.bpf.c | 112 +++++++++++++++++++++++++
tools/perf/util/off_cpu.h | 8 +-
2 files changed, 119 insertions(+), 1 deletion(-)
diff --git a/tools/perf/util/bpf_skel/off_cpu.bpf.c b/tools/perf/util/bpf_skel/off_cpu.bpf.c
index e2a887228fd9..c42d0e2d91d8 100644
--- a/tools/perf/util/bpf_skel/off_cpu.bpf.c
+++ b/tools/perf/util/bpf_skel/off_cpu.bpf.c
@@ -19,6 +19,7 @@
#define MAX_ENTRIES 102400
#define MAX_CPUS 4096
+#define MAX_OFFCPU_LEN 128
struct tstamp_data {
__u32 stack_id;
@@ -34,6 +35,7 @@ struct offcpu_key {
__u64 cgroup_id;
};
+/* for dumping at the end */
struct {
__uint(type, BPF_MAP_TYPE_STACK_TRACE);
__uint(key_size, sizeof(__u32));
@@ -41,6 +43,14 @@ struct {
__uint(max_entries, MAX_ENTRIES);
} stacks SEC(".maps");
+struct offcpu_data {
+ u64 array[MAX_OFFCPU_LEN];
+};
+
+struct stack_data {
+ u64 array[MAX_STACKS];
+};
+
struct {
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
__uint(key_size, sizeof(__u32));
@@ -48,6 +58,22 @@ struct {
__uint(max_entries, MAX_CPUS);
} offcpu_output SEC(".maps");
+/* temporary offcpu sample */
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(struct offcpu_data));
+ __uint(max_entries, 1);
+} offcpu_payload SEC(".maps");
+
+/* cached stack per task storage */
+struct {
+ __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, int);
+ __type(value, struct stack_data);
+} stack_cache SEC(".maps");
+
struct {
__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
__uint(map_flags, BPF_F_NO_PREALLOC);
@@ -194,12 +220,75 @@ static inline int can_record(struct task_struct *t, int state)
return 1;
}
+static inline bool check_bounds(int index)
+{
+ if (index < 0 || index >= MAX_OFFCPU_LEN)
+ return false;
+
+ return true;
+}
+
+static inline int copy_stack(struct stack_data *from,
+ struct offcpu_data *to, int n)
+{
+ int max_stacks = MAX_STACKS, len = 0;
+
+ if (!from)
+ return len;
+
+ for (int i = 0; i < max_stacks && from->array[i]; ++i) {
+ if (check_bounds(n + 2 + i)) {
+ to->array[n + 2 + i] = from->array[i];
+ ++len;
+ }
+ }
+ return len;
+}
+
+static int off_cpu_dump(void *ctx, struct offcpu_data *data, struct offcpu_key *key,
+ struct stack_data *stack_p, __u64 delta, __u64 timestamp)
+{
+ int size, n = 0, ip_pos = -1, len = 0;
+
+ if (sample_type & PERF_SAMPLE_TID && check_bounds(n))
+ data->array[n++] = (u64)key->tgid << 32 | key->pid;
+ if (sample_type & PERF_SAMPLE_PERIOD && check_bounds(n))
+ data->array[n++] = delta;
+ if (sample_type & PERF_SAMPLE_CALLCHAIN && check_bounds(n + 2)) {
+ /* data->array[n] is callchain->nr (updated later) */
+ data->array[n + 1] = PERF_CONTEXT_USER;
+ data->array[n + 2] = 0;
+
+ len = copy_stack(stack_p, data, n);
+
+ /* update length of callchain */
+ data->array[n] = len + 1;
+
+ /* update sample ip with the first callchain entry */
+ if (ip_pos >= 0)
+ data->array[ip_pos] = data->array[n + 2];
+
+ /* calculate sample callchain data->array length */
+ n += len + 2;
+ }
+ if (sample_type & PERF_SAMPLE_CGROUP && check_bounds(n))
+ data->array[n++] = key->cgroup_id;
+
+ size = n * sizeof(u64);
+ if (size >= 0 && size <= MAX_OFFCPU_LEN * sizeof(u64))
+ bpf_perf_event_output(ctx, &offcpu_output, BPF_F_CURRENT_CPU, data, size);
+
+ return 0;
+}
+
static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
struct task_struct *next, int state)
{
__u64 ts;
__u32 stack_id;
struct tstamp_data *pelem;
+ struct stack_data *stack_p;
+ int zero = 0;
ts = bpf_ktime_get_ns();
@@ -209,6 +298,21 @@ static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
stack_id = bpf_get_stackid(ctx, &stacks,
BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK);
+ /*
+ * if stacks are successfully collected, cache them to task_storage, they are then
+ * dumped if the off-cpu time hits the threshold.
+ */
+ if (stack_id > 0) {
+ stack_p = bpf_task_storage_get(&stack_cache, prev, NULL,
+ BPF_LOCAL_STORAGE_GET_F_CREATE);
+ if (stack_p) {
+ /* to pass the clang result unused warning */
+ int __attribute__((unused)) len;
+ len = bpf_get_stack(ctx, stack_p->array, MAX_STACKS * sizeof(u64),
+ BPF_F_USER_STACK) / sizeof(u64);
+ }
+ }
+
pelem = bpf_task_storage_get(&tstamp, prev, NULL,
BPF_LOCAL_STORAGE_GET_F_CREATE);
if (!pelem)
@@ -238,6 +342,14 @@ static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
else
bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY);
+ if (delta >= offcpu_thresh) {
+ struct offcpu_data *data = bpf_map_lookup_elem(&offcpu_payload, &zero);
+
+ stack_p = bpf_task_storage_get(&stack_cache, next, NULL, 0);
+ if (data && stack_p)
+ off_cpu_dump(ctx, data, &key, stack_p, delta, pelem->timestamp);
+ }
+
/* prevent to reuse the timestamp later */
pelem->timestamp = 0;
}
diff --git a/tools/perf/util/off_cpu.h b/tools/perf/util/off_cpu.h
index 357231cb1c38..eaf7be92472d 100644
--- a/tools/perf/util/off_cpu.h
+++ b/tools/perf/util/off_cpu.h
@@ -15,9 +15,15 @@ struct record_opts;
#define OFFCPU_SAMPLE_TYPES (PERF_SAMPLE_IDENTIFIER | PERF_SAMPLE_IP | \
PERF_SAMPLE_TID | PERF_SAMPLE_TIME | \
PERF_SAMPLE_ID | PERF_SAMPLE_CPU | \
- PERF_SAMPLE_PERIOD | PERF_SAMPLE_CALLCHAIN | \
+ PERF_SAMPLE_PERIOD | PERF_SAMPLE_RAW | \
PERF_SAMPLE_CGROUP)
+/*
+ * for embedded data to overwrite the original sample, duplicated sample types
+ * must be set in the original OFFCPU_SAMPLE_TYPES, except for callchain.
+ */
+#define OFFCPU_EMBEDDED_SAMPLE_TYPES (PERF_SAMPLE_TID | PERF_SAMPLE_PERIOD | \
+ PERF_SAMPLE_CALLCHAIN | PERF_SAMPLE_CGROUP)
#ifdef HAVE_BPF_SKEL
int off_cpu_prepare(struct evlist *evlist, struct target *target,
--
2.43.0

On Fri, Sep 27, 2024 at 01:27:32PM -0700, Howard Chu wrote:

> +#define MAX_OFFCPU_LEN 128

I guess it's too big as you only collect stack and a few more data.

> +static inline bool check_bounds(int index)

Likewise, we may get rid of the bound check entirely as it's always
guaranteed.

> +	if (sample_type & PERF_SAMPLE_TID && check_bounds(n))

You didn't set the sample_type yet. Is it intentional?

> +		/* update sample ip with the first callchain entry */
> +		if (ip_pos >= 0)

You don't set the ip_pos, just remove this part and let userspace handle
it later.

> +	if (stack_id > 0) {
> +		stack_p = bpf_task_storage_get(&stack_cache, prev, NULL,
> +					       BPF_LOCAL_STORAGE_GET_F_CREATE);

Can you just add a new field in the tstamp map instead?

> +		if (stack_p) {
> +			/* to pass the clang result unused warning */
> +			int __attribute__((unused)) len;
> +			len = bpf_get_stack(ctx, stack_p->array, MAX_STACKS * sizeof(u64),
> +					    BPF_F_USER_STACK) / sizeof(u64);

I think you can move to the next task if it failed to get the stack trace.

> +	if (delta >= offcpu_thresh) {
> +		struct offcpu_data *data = bpf_map_lookup_elem(&offcpu_payload, &zero);
> +
> +		stack_p = bpf_task_storage_get(&stack_cache, next, NULL, 0);
> +		if (data && stack_p)
> +			off_cpu_dump(ctx, data, &key, stack_p, delta, pelem->timestamp);
> +	}

I think you should either dump the data directly or save it in the
off_cpu map. Otherwise the time is accounted twice.

Thanks,
Namhyung