[PATCH v5 6/6] perf tools: Flush remaining samples w/o deferred callchains

Namhyung Kim posted 6 patches 1 week, 4 days ago
There is a newer version of this series
[PATCH v5 6/6] perf tools: Flush remaining samples w/o deferred callchains
Posted by Namhyung Kim 1 week, 4 days ago
It's possible that some kernel samples don't have matching deferred
callchain records when the profiling session was ended before the
threads came back to userspace.  Let's flush the samples before
finishing the session.

Also 32-bit systems can see partial mmap for the data.  In that case,
deferred samples won't point to the correct data once the mapping moves
to the next portion of the file.  Copy the original sample before it
unmaps the current data.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/session.c | 98 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 98 insertions(+)

diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 2e777fd1bcf6707b..b781e01ddcb4876b 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -1288,8 +1288,13 @@ static int evlist__deliver_sample(struct evlist *evlist, const struct perf_tool
 struct deferred_event {
 	struct list_head list;
 	union perf_event *event;
+	bool allocated;
 };
 
+/*
+ * This is called when a deferred callchain record comes up.  Find all matching
+ * samples, merge the callchains and process them.
+ */
 static int evlist__deliver_deferred_samples(struct evlist *evlist,
 					    const struct perf_tool *tool,
 					    union  perf_event *event,
@@ -1331,6 +1336,86 @@ static int evlist__deliver_deferred_samples(struct evlist *evlist,
 			free(orig_sample.callchain);
 
 		list_del(&de->list);
+		if (de->allocated)
+			free(de->event);
+		free(de);
+
+		if (ret)
+			break;
+	}
+	return ret;
+}
+
+/*
+ * This is called when the backing mmap is about to go away.  It needs to save
+ * the original sample data until it finds the matching deferred callchains.
+ */
+static void evlist__copy_deferred_samples(struct evlist *evlist,
+					  const struct perf_tool *tool,
+					  struct machine *machine)
+{
+	struct deferred_event *de, *tmp;
+	struct evsel *evsel;
+	int ret = 0;
+
+	list_for_each_entry_safe(de, tmp, &evlist->deferred_samples, list) {
+		struct perf_sample sample;
+		size_t sz = de->event->header.size;
+		void *buf;
+
+		if (de->allocated)
+			continue;
+
+		buf = malloc(sz);
+		if (buf) {
+			memcpy(buf, de->event, sz);
+			de->event = buf;
+			de->allocated = true;
+			continue;
+		}
+
+		/* The allocation failed, flush the sample now */
+		ret = evlist__parse_sample(evlist, de->event, &sample);
+		if (ret == 0) {
+			evsel = evlist__id2evsel(evlist, sample.id);
+			evlist__deliver_sample(evlist, tool, de->event,
+					       &sample, evsel, machine);
+		}
+
+		list_del(&de->list);
+		BUG_ON(de->allocated);
+		free(de);
+	}
+}
+
+/*
+ * This is called at the end of the data processing for the session.  Flush the
+ * remaining samples as there's no hope for matching deferred callchains.
+ */
+static int evlist__flush_deferred_samples(struct evlist *evlist,
+					  const struct perf_tool *tool,
+					  struct machine *machine)
+{
+	struct deferred_event *de, *tmp;
+	struct evsel *evsel;
+	int ret = 0;
+
+	list_for_each_entry_safe(de, tmp, &evlist->deferred_samples, list) {
+		struct perf_sample sample;
+
+		ret = evlist__parse_sample(evlist, de->event, &sample);
+		if (ret < 0) {
+			pr_err("failed to parse original sample\n");
+			break;
+		}
+
+		evsel = evlist__id2evsel(evlist, sample.id);
+		ret = evlist__deliver_sample(evlist, tool, de->event,
+					     &sample, evsel, machine);
+
+		list_del(&de->list);
+		if (de->allocated)
+			free(de->event);
 		free(de);
 
 		if (ret)
@@ -1374,6 +1459,7 @@ static int machines__deliver_event(struct machines *machines,
 				return -ENOMEM;
 
 			de->event = event;
+			de->allocated = false;
 			list_add_tail(&de->list, &evlist->deferred_samples);
 			return 0;
 		}
@@ -2218,6 +2304,8 @@ reader__mmap(struct reader *rd, struct perf_session *session)
 	}
 
 	if (mmaps[rd->mmap_idx]) {
+		evlist__copy_deferred_samples(session->evlist, session->tool,
+					      &session->machines.host);
 		munmap(mmaps[rd->mmap_idx], rd->mmap_size);
 		mmaps[rd->mmap_idx] = NULL;
 	}
@@ -2372,6 +2460,11 @@ static int __perf_session__process_events(struct perf_session *session)
 	if (err)
 		goto out_err;
 	err = auxtrace__flush_events(session, tool);
+	if (err)
+		goto out_err;
+	err = evlist__flush_deferred_samples(session->evlist,
+					     session->tool,
+					     &session->machines.host);
 	if (err)
 		goto out_err;
 	err = perf_session__flush_thread_stacks(session);
@@ -2494,6 +2587,11 @@ static int __perf_session__process_dir_events(struct perf_session *session)
 	if (ret)
 		goto out_err;
 
+	ret = evlist__flush_deferred_samples(session->evlist, tool,
+					     &session->machines.host);
+	if (ret)
+		goto out_err;
+
 	ret = perf_session__flush_thread_stacks(session);
 out_err:
 	ui_progress__finish();
-- 
2.52.0.rc1.455.g30608eb744-goog
Re: [PATCH v5 6/6] perf tools: Flush remaining samples w/o deferred callchains
Posted by Ian Rogers 1 week, 4 days ago
On Wed, Nov 19, 2025 at 6:11 PM Namhyung Kim <namhyung@kernel.org> wrote:
>
> It's possible that some kernel samples don't have matching deferred
> callchain records when the profiling session was ended before the
> threads came back to userspace.  Let's flush the samples before
> finish the session.
>
> Also 32-bit systems can see partial mmap for the data.  In that case,
> deferred samples won't point to the correct data once the mapping moves
> to the next portion of the file.  Copy the original sample before it
> unmaps the current data.

I think it is simpler to always copy. We may have events from
synthesis, inject, .. and not the reader. Relying on callers to know
that someone made a copy of the event and to make a defensive copy on
their behalf just feels error prone.

In the python session API I need to deal with the lifetime of events.
Currently the events are copied:
https://web.git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/python.c?h=perf-tools-next#n507
and I'm doing this for session tool callbacks:
https://lore.kernel.org/lkml/20251029053413.355154-12-irogers@google.com/
I think it can be made lazier by knowing the tool callback can assume
the event and sample are valid. We can delay the copying of the
event/sample for if the pyevent has a reference count >1 and we're
returning out of the tool callback. Doing some kind of global
knowledge in the reader for maintaining the correctness of memory, I'm
just not clear on how to make it always work.

Thanks,
Ian

> Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> ---
>  tools/perf/util/session.c | 98 +++++++++++++++++++++++++++++++++++++++
>  1 file changed, 98 insertions(+)
>
> diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
> index 2e777fd1bcf6707b..b781e01ddcb4876b 100644
> --- a/tools/perf/util/session.c
> +++ b/tools/perf/util/session.c
> @@ -1288,8 +1288,13 @@ static int evlist__deliver_sample(struct evlist *evlist, const struct perf_tool
>  struct deferred_event {
>         struct list_head list;
>         union perf_event *event;
> +       bool allocated;
>  };
>
> +/*
> + * This is called when a deferred callchain record comes up.  Find all matching
> + * samples, merge the callchains and process them.
> + */
>  static int evlist__deliver_deferred_samples(struct evlist *evlist,
>                                             const struct perf_tool *tool,
>                                             union  perf_event *event,
> @@ -1331,6 +1336,86 @@ static int evlist__deliver_deferred_samples(struct evlist *evlist,
>                         free(orig_sample.callchain);
>
>                 list_del(&de->list);
> +               if (de->allocated)
> +                       free(de->event);
> +               free(de);
> +
> +               if (ret)
> +                       break;
> +       }
> +       return ret;
> +}
> +
> +/*
> + * This is called when the backing mmap is about to go away.  It needs to save
> + * the original sample data until it finds the matching deferred callchains.
> + */
> +static void evlist__copy_deferred_samples(struct evlist *evlist,
> +                                         const struct perf_tool *tool,
> +                                         struct machine *machine)
> +{
> +       struct deferred_event *de, *tmp;
> +       struct evsel *evsel;
> +       int ret = 0;
> +
> +       list_for_each_entry_safe(de, tmp, &evlist->deferred_samples, list) {
> +               struct perf_sample sample;
> +               size_t sz = de->event->header.size;
> +               void *buf;
> +
> +               if (de->allocated)
> +                       continue;
> +
> +               buf = malloc(sz);
> +               if (buf) {
> +                       memcpy(buf, de->event, sz);
> +                       de->event = buf;
> +                       de->allocated = true;
> +                       continue;
> +               }
> +
> +               /* The allocation failed, flush the sample now */
> +               ret = evlist__parse_sample(evlist, de->event, &sample);
> +               if (ret == 0) {
> +                       evsel = evlist__id2evsel(evlist, sample.id);
> +                       evlist__deliver_sample(evlist, tool, de->event,
> +                                              &sample, evsel, machine);
> +               }
> +
> +               list_del(&de->list);
> +               BUG_ON(de->allocated);
> +               free(de);
> +       }
> +}
> +
> +/*
> + * This is called at the end of the data processing for the session.  Flush the
> + * remaining samples as there's no hope for matching deferred callchains.
> + */
> +static int evlist__flush_deferred_samples(struct evlist *evlist,
> +                                         const struct perf_tool *tool,
> +                                         struct machine *machine)
> +{
> +       struct deferred_event *de, *tmp;
> +       struct evsel *evsel;
> +       int ret = 0;
> +
> +       list_for_each_entry_safe(de, tmp, &evlist->deferred_samples, list) {
> +               struct perf_sample sample;
> +
> +               ret = evlist__parse_sample(evlist, de->event, &sample);
> +               if (ret < 0) {
> +                       pr_err("failed to parse original sample\n");
> +                       break;
> +               }
> +
> +               evsel = evlist__id2evsel(evlist, sample.id);
> +               ret = evlist__deliver_sample(evlist, tool, de->event,
> +                                            &sample, evsel, machine);
> +
> +               list_del(&de->list);
> +               if (de->allocated)
> +                       free(de->event);
>                 free(de);
>
>                 if (ret)
> @@ -1374,6 +1459,7 @@ static int machines__deliver_event(struct machines *machines,
>                                 return -ENOMEM;
>
>                         de->event = event;
> +                       de->allocated = false;
>                         list_add_tail(&de->list, &evlist->deferred_samples);
>                         return 0;
>                 }
> @@ -2218,6 +2304,8 @@ reader__mmap(struct reader *rd, struct perf_session *session)
>         }
>
>         if (mmaps[rd->mmap_idx]) {
> +               evlist__copy_deferred_samples(session->evlist, session->tool,
> +                                             &session->machines.host);
>                 munmap(mmaps[rd->mmap_idx], rd->mmap_size);
>                 mmaps[rd->mmap_idx] = NULL;
>         }
> @@ -2372,6 +2460,11 @@ static int __perf_session__process_events(struct perf_session *session)
>         if (err)
>                 goto out_err;
>         err = auxtrace__flush_events(session, tool);
> +       if (err)
> +               goto out_err;
> +       err = evlist__flush_deferred_samples(session->evlist,
> +                                            session->tool,
> +                                            &session->machines.host);
>         if (err)
>                 goto out_err;
>         err = perf_session__flush_thread_stacks(session);
> @@ -2494,6 +2587,11 @@ static int __perf_session__process_dir_events(struct perf_session *session)
>         if (ret)
>                 goto out_err;
>
> +       ret = evlist__flush_deferred_samples(session->evlist, tool,
> +                                            &session->machines.host);
> +       if (ret)
> +               goto out_err;
> +
>         ret = perf_session__flush_thread_stacks(session);
>  out_err:
>         ui_progress__finish();
> --
> 2.52.0.rc1.455.g30608eb744-goog
>
Re: [PATCH v5 6/6] perf tools: Flush remaining samples w/o deferred callchains
Posted by Ian Rogers 1 week, 4 days ago
On Wed, Nov 19, 2025 at 9:29 PM Ian Rogers <irogers@google.com> wrote:
>
> On Wed, Nov 19, 2025 at 6:11 PM Namhyung Kim <namhyung@kernel.org> wrote:
> >
> > It's possible that some kernel samples don't have matching deferred
> > callchain records when the profiling session was ended before the
> > threads came back to userspace.  Let's flush the samples before
> > finish the session.
> >
> > Also 32-bit systems can see partial mmap for the data.  In that case,
> > deferred samples won't point to the correct data once the mapping moves
> > to the next portion of the file.  Copy the original sample before it
> > unmaps the current data.
>
> I think it is simpler to always copy. We may have events from
> synthesis, inject, .. and not the reader. Relying on callers to know
> that someone made a copy of the event and to make a defensive copy on
> their behalf just feels error prone.
>
> In the python session API I need to deal with the lifetime of events.
> Currently the events are copied:
> https://web.git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/python.c?h=perf-tools-next#n507
> and I'm doing this for session tool callbacks:
> https://lore.kernel.org/lkml/20251029053413.355154-12-irogers@google.com/
> I think it can be made lazier by knowing the tool callback can assume
> the event and sample are valid. We can delay the copying of the
> event/sample for if the pyevent has a reference count >1 and we're
> returning out of the tool callback. Doing some kind of global
> knowledge in the reader for maintaining the correctness of memory, I'm
> just not clear on how to make it always work.

I believe we always reuse the memory for the event, per event, in pipe mode:
https://web.git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/session.c?h=perf-tools-next#n1868
so a lazy copy will be broken for the pipe mode case.

Thanks,
Ian

> > Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> > ---
> >  tools/perf/util/session.c | 98 +++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 98 insertions(+)
> >
> > diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
> > index 2e777fd1bcf6707b..b781e01ddcb4876b 100644
> > --- a/tools/perf/util/session.c
> > +++ b/tools/perf/util/session.c
> > @@ -1288,8 +1288,13 @@ static int evlist__deliver_sample(struct evlist *evlist, const struct perf_tool
> >  struct deferred_event {
> >         struct list_head list;
> >         union perf_event *event;
> > +       bool allocated;
> >  };
> >
> > +/*
> > + * This is called when a deferred callchain record comes up.  Find all matching
> > + * samples, merge the callchains and process them.
> > + */
> >  static int evlist__deliver_deferred_samples(struct evlist *evlist,
> >                                             const struct perf_tool *tool,
> >                                             union  perf_event *event,
> > @@ -1331,6 +1336,86 @@ static int evlist__deliver_deferred_samples(struct evlist *evlist,
> >                         free(orig_sample.callchain);
> >
> >                 list_del(&de->list);
> > +               if (de->allocated)
> > +                       free(de->event);
> > +               free(de);
> > +
> > +               if (ret)
> > +                       break;
> > +       }
> > +       return ret;
> > +}
> > +
> > +/*
> > + * This is called when the backing mmap is about to go away.  It needs to save
> > + * the original sample data until it finds the matching deferred callchains.
> > + */
> > +static void evlist__copy_deferred_samples(struct evlist *evlist,
> > +                                         const struct perf_tool *tool,
> > +                                         struct machine *machine)
> > +{
> > +       struct deferred_event *de, *tmp;
> > +       struct evsel *evsel;
> > +       int ret = 0;
> > +
> > +       list_for_each_entry_safe(de, tmp, &evlist->deferred_samples, list) {
> > +               struct perf_sample sample;
> > +               size_t sz = de->event->header.size;
> > +               void *buf;
> > +
> > +               if (de->allocated)
> > +                       continue;
> > +
> > +               buf = malloc(sz);
> > +               if (buf) {
> > +                       memcpy(buf, de->event, sz);
> > +                       de->event = buf;
> > +                       de->allocated = true;
> > +                       continue;
> > +               }
> > +
> > +               /* The allocation failed, flush the sample now */
> > +               ret = evlist__parse_sample(evlist, de->event, &sample);
> > +               if (ret == 0) {
> > +                       evsel = evlist__id2evsel(evlist, sample.id);
> > +                       evlist__deliver_sample(evlist, tool, de->event,
> > +                                              &sample, evsel, machine);
> > +               }
> > +
> > +               list_del(&de->list);
> > +               BUG_ON(de->allocated);
> > +               free(de);
> > +       }
> > +}
> > +
> > +/*
> > + * This is called at the end of the data processing for the session.  Flush the
> > + * remaining samples as there's no hope for matching deferred callchains.
> > + */
> > +static int evlist__flush_deferred_samples(struct evlist *evlist,
> > +                                         const struct perf_tool *tool,
> > +                                         struct machine *machine)
> > +{
> > +       struct deferred_event *de, *tmp;
> > +       struct evsel *evsel;
> > +       int ret = 0;
> > +
> > +       list_for_each_entry_safe(de, tmp, &evlist->deferred_samples, list) {
> > +               struct perf_sample sample;
> > +
> > +               ret = evlist__parse_sample(evlist, de->event, &sample);
> > +               if (ret < 0) {
> > +                       pr_err("failed to parse original sample\n");
> > +                       break;
> > +               }
> > +
> > +               evsel = evlist__id2evsel(evlist, sample.id);
> > +               ret = evlist__deliver_sample(evlist, tool, de->event,
> > +                                            &sample, evsel, machine);
> > +
> > +               list_del(&de->list);
> > +               if (de->allocated)
> > +                       free(de->event);
> >                 free(de);
> >
> >                 if (ret)
> > @@ -1374,6 +1459,7 @@ static int machines__deliver_event(struct machines *machines,
> >                                 return -ENOMEM;
> >
> >                         de->event = event;
> > +                       de->allocated = false;
> >                         list_add_tail(&de->list, &evlist->deferred_samples);
> >                         return 0;
> >                 }
> > @@ -2218,6 +2304,8 @@ reader__mmap(struct reader *rd, struct perf_session *session)
> >         }
> >
> >         if (mmaps[rd->mmap_idx]) {
> > +               evlist__copy_deferred_samples(session->evlist, session->tool,
> > +                                             &session->machines.host);
> >                 munmap(mmaps[rd->mmap_idx], rd->mmap_size);
> >                 mmaps[rd->mmap_idx] = NULL;
> >         }
> > @@ -2372,6 +2460,11 @@ static int __perf_session__process_events(struct perf_session *session)
> >         if (err)
> >                 goto out_err;
> >         err = auxtrace__flush_events(session, tool);
> > +       if (err)
> > +               goto out_err;
> > +       err = evlist__flush_deferred_samples(session->evlist,
> > +                                            session->tool,
> > +                                            &session->machines.host);
> >         if (err)
> >                 goto out_err;
> >         err = perf_session__flush_thread_stacks(session);
> > @@ -2494,6 +2587,11 @@ static int __perf_session__process_dir_events(struct perf_session *session)
> >         if (ret)
> >                 goto out_err;
> >
> > +       ret = evlist__flush_deferred_samples(session->evlist, tool,
> > +                                            &session->machines.host);
> > +       if (ret)
> > +               goto out_err;
> > +
> >         ret = perf_session__flush_thread_stacks(session);
> >  out_err:
> >         ui_progress__finish();
> > --
> > 2.52.0.rc1.455.g30608eb744-goog
> >