[PATCH v7 03/10] perf record --off-cpu: Parse off-cpu event

Howard Chu posted 10 patches 2 weeks, 1 day ago
There is a newer version of this series
[PATCH v7 03/10] perf record --off-cpu: Parse off-cpu event
Posted by Howard Chu 2 weeks, 1 day ago
Parse the off-cpu event using parse_event, as bpf-output.

no-inherit is should be set to 1, here's the reason:

We update the BPF perf_event map for direct off-cpu sample dumping (in
following patches), it executes as follows:

bpf_map_update_value()
 bpf_fd_array_map_update_elem()
  perf_event_fd_array_get_ptr()
   perf_event_read_local()

In perf_event_read_local(), there is:

int perf_event_read_local(struct perf_event *event, u64 *value,
			  u64 *enabled, u64 *running)
{
...
	/*
	 * It must not be an event with inherit set, we cannot read
	 * all child counters from atomic context.
	 */
	if (event->attr.inherit) {
		ret = -EOPNOTSUPP;
		goto out;
	}

Which means no-inherit has to be true for updating the BPF perf_event
map.

Moreover, for bpf-output events, we primarily want a system-wide event
instead of a per-task event.

The reason is that in BPF's bpf_perf_event_output(), BPF uses the CPU
index to retrieve the perf_event file descriptor it outputs to.

Making a bpf-output event system-wide naturally satisfies this
requirement by mapping CPU appropriately.

Suggested-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Howard Chu <howardchu95@gmail.com>
---
 tools/perf/util/bpf_off_cpu.c | 33 +++++++++++----------------------
 1 file changed, 11 insertions(+), 22 deletions(-)

diff --git a/tools/perf/util/bpf_off_cpu.c b/tools/perf/util/bpf_off_cpu.c
index a590a8ac1f9d..558c5e5c2dc3 100644
--- a/tools/perf/util/bpf_off_cpu.c
+++ b/tools/perf/util/bpf_off_cpu.c
@@ -38,32 +38,21 @@ union off_cpu_data {
 
 static int off_cpu_config(struct evlist *evlist)
 {
+	char off_cpu_event[64];
 	struct evsel *evsel;
-	struct perf_event_attr attr = {
-		.type	= PERF_TYPE_SOFTWARE,
-		.config = PERF_COUNT_SW_BPF_OUTPUT,
-		.size	= sizeof(attr), /* to capture ABI version */
-	};
-	char *evname = strdup(OFFCPU_EVENT);
-
-	if (evname == NULL)
-		return -ENOMEM;
 
-	evsel = evsel__new(&attr);
-	if (!evsel) {
-		free(evname);
-		return -ENOMEM;
+	scnprintf(off_cpu_event, sizeof(off_cpu_event), "bpf-output/no-inherit=1,name=%s/", OFFCPU_EVENT);
+	if (parse_event(evlist, off_cpu_event)) {
+		pr_err("Failed to open off-cpu event\n");
+		return -1;
 	}
 
-	evsel->core.attr.freq = 1;
-	evsel->core.attr.sample_period = 1;
-	/* off-cpu analysis depends on stack trace */
-	evsel->core.attr.sample_type = PERF_SAMPLE_CALLCHAIN;
-
-	evlist__add(evlist, evsel);
-
-	free(evsel->name);
-	evsel->name = evname;
+	evlist__for_each_entry(evlist, evsel) {
+		if (evsel__is_offcpu_event(evsel)) {
+			evsel->core.system_wide = true;
+			break;
+		}
+	}
 
 	return 0;
 }
-- 
2.43.0
Re: [PATCH v7 03/10] perf record --off-cpu: Parse off-cpu event
Posted by Ian Rogers 1 week, 5 days ago
On Fri, Nov 8, 2024 at 12:41 PM Howard Chu <howardchu95@gmail.com> wrote:
>
> Parse the off-cpu event using parse_event, as bpf-output.
>
> no-inherit is should be set to 1, here's the reason:

nit: s/is should be/should be/

> We update the BPF perf_event map for direct off-cpu sample dumping (in
> following patches), it executes as follows:
>
> bpf_map_update_value()
>  bpf_fd_array_map_update_elem()
>   perf_event_fd_array_get_ptr()
>    perf_event_read_local()
>
> In perf_event_read_local(), there is:
>
> int perf_event_read_local(struct perf_event *event, u64 *value,
>                           u64 *enabled, u64 *running)
> {
> ...
>         /*
>          * It must not be an event with inherit set, we cannot read
>          * all child counters from atomic context.
>          */
>         if (event->attr.inherit) {
>                 ret = -EOPNOTSUPP;
>                 goto out;
>         }
>
> Which means no-inherit has to be true for updating the BPF perf_event
> map.
>
> Moreover, for bpf-output events, we primarily want a system-wide event
> instead of a per-task event.
>
> The reason is that in BPF's bpf_perf_event_output(), BPF uses the CPU
> index to retrieve the perf_event file descriptor it outputs to.
>
> Making a bpf-output event system-wide naturally satisfies this
> requirement by mapping CPU appropriately.
>
> Suggested-by: Namhyung Kim <namhyung@kernel.org>
> Signed-off-by: Howard Chu <howardchu95@gmail.com>
> ---
>  tools/perf/util/bpf_off_cpu.c | 33 +++++++++++----------------------
>  1 file changed, 11 insertions(+), 22 deletions(-)
>
> diff --git a/tools/perf/util/bpf_off_cpu.c b/tools/perf/util/bpf_off_cpu.c
> index a590a8ac1f9d..558c5e5c2dc3 100644
> --- a/tools/perf/util/bpf_off_cpu.c
> +++ b/tools/perf/util/bpf_off_cpu.c
> @@ -38,32 +38,21 @@ union off_cpu_data {
>
>  static int off_cpu_config(struct evlist *evlist)
>  {
> +       char off_cpu_event[64];
>         struct evsel *evsel;
> -       struct perf_event_attr attr = {
> -               .type   = PERF_TYPE_SOFTWARE,
> -               .config = PERF_COUNT_SW_BPF_OUTPUT,
> -               .size   = sizeof(attr), /* to capture ABI version */
> -       };
> -       char *evname = strdup(OFFCPU_EVENT);
> -
> -       if (evname == NULL)
> -               return -ENOMEM;
>
> -       evsel = evsel__new(&attr);
> -       if (!evsel) {
> -               free(evname);
> -               return -ENOMEM;
> +       scnprintf(off_cpu_event, sizeof(off_cpu_event), "bpf-output/no-inherit=1,name=%s/", OFFCPU_EVENT);
> +       if (parse_event(evlist, off_cpu_event)) {
> +               pr_err("Failed to open off-cpu event\n");
> +               return -1;

Woot, love this! Much happier to see parse_events being used than hand
crafted attributes. This will help us keep things synchronized via
event parsing.

Reviewed-by: Ian Rogers <irogers@google.com>

Thanks,
Ian

>         }
>
> -       evsel->core.attr.freq = 1;
> -       evsel->core.attr.sample_period = 1;
> -       /* off-cpu analysis depends on stack trace */
> -       evsel->core.attr.sample_type = PERF_SAMPLE_CALLCHAIN;
> -
> -       evlist__add(evlist, evsel);
> -
> -       free(evsel->name);
> -       evsel->name = evname;
> +       evlist__for_each_entry(evlist, evsel) {
> +               if (evsel__is_offcpu_event(evsel)) {
> +                       evsel->core.system_wide = true;
> +                       break;
> +               }
> +       }
>
>         return 0;
>  }
> --
> 2.43.0
>
Re: [PATCH v7 03/10] perf record --off-cpu: Parse off-cpu event
Posted by Howard Chu 1 week, 5 days ago
Hello,

On Mon, Nov 11, 2024 at 9:45 AM Ian Rogers <irogers@google.com> wrote:
>
> On Fri, Nov 8, 2024 at 12:41 PM Howard Chu <howardchu95@gmail.com> wrote:
> >
> > Parse the off-cpu event using parse_event, as bpf-output.
> >
> > no-inherit is should be set to 1, here's the reason:
>
> nit: s/is should be/should be/

Interesting sed substitution format.

Good catch! I will fix that in a next commit, thanks!

>
> > We update the BPF perf_event map for direct off-cpu sample dumping (in
> > following patches), it executes as follows:
> >
> > bpf_map_update_value()
> >  bpf_fd_array_map_update_elem()
> >   perf_event_fd_array_get_ptr()
> >    perf_event_read_local()
> >
> > In perf_event_read_local(), there is:
> >
> > int perf_event_read_local(struct perf_event *event, u64 *value,
> >                           u64 *enabled, u64 *running)
> > {
> > ...
> >         /*
> >          * It must not be an event with inherit set, we cannot read
> >          * all child counters from atomic context.
> >          */
> >         if (event->attr.inherit) {
> >                 ret = -EOPNOTSUPP;
> >                 goto out;
> >         }
> >
> > Which means no-inherit has to be true for updating the BPF perf_event
> > map.
> >
> > Moreover, for bpf-output events, we primarily want a system-wide event
> > instead of a per-task event.
> >
> > The reason is that in BPF's bpf_perf_event_output(), BPF uses the CPU
> > index to retrieve the perf_event file descriptor it outputs to.
> >
> > Making a bpf-output event system-wide naturally satisfies this
> > requirement by mapping CPU appropriately.
> >
> > Suggested-by: Namhyung Kim <namhyung@kernel.org>
> > Signed-off-by: Howard Chu <howardchu95@gmail.com>
> > ---
> >  tools/perf/util/bpf_off_cpu.c | 33 +++++++++++----------------------
> >  1 file changed, 11 insertions(+), 22 deletions(-)
> >
> > diff --git a/tools/perf/util/bpf_off_cpu.c b/tools/perf/util/bpf_off_cpu.c
> > index a590a8ac1f9d..558c5e5c2dc3 100644
> > --- a/tools/perf/util/bpf_off_cpu.c
> > +++ b/tools/perf/util/bpf_off_cpu.c
> > @@ -38,32 +38,21 @@ union off_cpu_data {
> >
> >  static int off_cpu_config(struct evlist *evlist)
> >  {
> > +       char off_cpu_event[64];
> >         struct evsel *evsel;
> > -       struct perf_event_attr attr = {
> > -               .type   = PERF_TYPE_SOFTWARE,
> > -               .config = PERF_COUNT_SW_BPF_OUTPUT,
> > -               .size   = sizeof(attr), /* to capture ABI version */
> > -       };
> > -       char *evname = strdup(OFFCPU_EVENT);
> > -
> > -       if (evname == NULL)
> > -               return -ENOMEM;
> >
> > -       evsel = evsel__new(&attr);
> > -       if (!evsel) {
> > -               free(evname);
> > -               return -ENOMEM;
> > +       scnprintf(off_cpu_event, sizeof(off_cpu_event), "bpf-output/no-inherit=1,name=%s/", OFFCPU_EVENT);
> > +       if (parse_event(evlist, off_cpu_event)) {
> > +               pr_err("Failed to open off-cpu event\n");
> > +               return -1;
>
> Woot, love this! Much happier to see parse_events being used than hand
> crafted attributes. This will help us keep things synchronized via
> event parsing.
>
> Reviewed-by: Ian Rogers <irogers@google.com>
>
> Thanks,
> Ian
>
> >         }
> >
> > -       evsel->core.attr.freq = 1;
> > -       evsel->core.attr.sample_period = 1;
> > -       /* off-cpu analysis depends on stack trace */
> > -       evsel->core.attr.sample_type = PERF_SAMPLE_CALLCHAIN;
> > -
> > -       evlist__add(evlist, evsel);
> > -
> > -       free(evsel->name);
> > -       evsel->name = evname;
> > +       evlist__for_each_entry(evlist, evsel) {
> > +               if (evsel__is_offcpu_event(evsel)) {
> > +                       evsel->core.system_wide = true;
> > +                       break;
> > +               }
> > +       }
> >
> >         return 0;
> >  }
> > --
> > 2.43.0
> >

Thanks,
Howard