[PATCH v6 5/6] perf tools: Merge deferred user callchains

Namhyung Kim posted 6 patches 2 months, 2 weeks ago
[PATCH v6 5/6] perf tools: Merge deferred user callchains
Posted by Namhyung Kim 2 months, 2 weeks ago
Save samples with deferred callchains in a separate list and deliver
them after merging the user callchains.  If users don't want to merge
they can set tool->merge_deferred_callchains to false to prevent the
behavior.

With the previous result, perf script will now show the merged callchains.

  $ perf script
  ...
  pwd    2312   121.163435:     249113 cpu/cycles/P:
          ffffffff845b78d8 __build_id_parse.isra.0+0x218 ([kernel.kallsyms])
          ffffffff83bb5bf6 perf_event_mmap+0x2e6 ([kernel.kallsyms])
          ffffffff83c31959 mprotect_fixup+0x1e9 ([kernel.kallsyms])
          ffffffff83c31dc5 do_mprotect_pkey+0x2b5 ([kernel.kallsyms])
          ffffffff83c3206f __x64_sys_mprotect+0x1f ([kernel.kallsyms])
          ffffffff845e6692 do_syscall_64+0x62 ([kernel.kallsyms])
          ffffffff8360012f entry_SYSCALL_64_after_hwframe+0x76 ([kernel.kallsyms])
              7f18fe337fa7 mprotect+0x7 (/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2)
              7f18fe330e0f _dl_sysdep_start+0x7f (/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2)
              7f18fe331448 _dl_start_user+0x0 (/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2)
  ...

The old output can be obtained using the --no-merge-callchains option.
Also perf report can get the user callchain entry at the end.

  $ perf report --no-children --stdio -q -S __build_id_parse.isra.0
  # symbol: __build_id_parse.isra.0
       8.40%  pwd      [kernel.kallsyms]
              |
              ---__build_id_parse.isra.0
                 perf_event_mmap
                 mprotect_fixup
                 do_mprotect_pkey
                 __x64_sys_mprotect
                 do_syscall_64
                 entry_SYSCALL_64_after_hwframe
                 mprotect
                 _dl_sysdep_start
                 _dl_start_user

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/Documentation/perf-script.txt |  5 ++
 tools/perf/builtin-inject.c              |  1 +
 tools/perf/builtin-report.c              |  1 +
 tools/perf/builtin-script.c              |  4 ++
 tools/perf/util/callchain.c              | 29 +++++++++
 tools/perf/util/callchain.h              |  3 +
 tools/perf/util/evlist.c                 |  1 +
 tools/perf/util/evlist.h                 |  2 +
 tools/perf/util/session.c                | 79 +++++++++++++++++++++++-
 tools/perf/util/tool.c                   |  2 +
 tools/perf/util/tool.h                   |  1 +
 11 files changed, 127 insertions(+), 1 deletion(-)

diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index 28bec7e78bc858ba..03d1129606328d6d 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -527,6 +527,11 @@ include::itrace.txt[]
 	The known limitations include exception handing such as
 	setjmp/longjmp will have calls/returns not match.
 
+--merge-callchains::
+	Enable merging deferred user callchains if available.  This is the
+	default behavior.  If you want to see separate CALLCHAIN_DEFERRED
+	records for some reason, use --no-merge-callchains explicitly.
+
 :GMEXAMPLECMD: script
 :GMEXAMPLESUBCMD:
 include::guest-files.txt[]
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index bd9245d2dd41aa48..51d2721b6db9dccb 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -2527,6 +2527,7 @@ int cmd_inject(int argc, const char **argv)
 	inject.tool.auxtrace		= perf_event__repipe_auxtrace;
 	inject.tool.bpf_metadata	= perf_event__repipe_op2_synth;
 	inject.tool.dont_split_sample_group = true;
+	inject.tool.merge_deferred_callchains = false;
 	inject.session = __perf_session__new(&data, &inject.tool,
 					     /*trace_event_repipe=*/inject.output.is_pipe,
 					     /*host_env=*/NULL);
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 2bc269f5fcef8023..add6b1c2aaf04270 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -1614,6 +1614,7 @@ int cmd_report(int argc, const char **argv)
 	report.tool.event_update	 = perf_event__process_event_update;
 	report.tool.feature		 = process_feature_event;
 	report.tool.ordering_requires_timestamps = true;
+	report.tool.merge_deferred_callchains = !dump_trace;
 
 	session = perf_session__new(&data, &report.tool);
 	if (IS_ERR(session)) {
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 85b42205a71b3993..62e43d3c5ad731a0 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -4009,6 +4009,7 @@ int cmd_script(int argc, const char **argv)
 	bool header_only = false;
 	bool script_started = false;
 	bool unsorted_dump = false;
+	bool merge_deferred_callchains = true;
 	char *rec_script_path = NULL;
 	char *rep_script_path = NULL;
 	struct perf_session *session;
@@ -4162,6 +4163,8 @@ int cmd_script(int argc, const char **argv)
 		    "Guest code can be found in hypervisor process"),
 	OPT_BOOLEAN('\0', "stitch-lbr", &script.stitch_lbr,
 		    "Enable LBR callgraph stitching approach"),
+	OPT_BOOLEAN('\0', "merge-callchains", &merge_deferred_callchains,
+		    "Enable merge deferred user callchains"),
 	OPTS_EVSWITCH(&script.evswitch),
 	OPT_END()
 	};
@@ -4418,6 +4421,7 @@ int cmd_script(int argc, const char **argv)
 	script.tool.throttle		 = process_throttle_event;
 	script.tool.unthrottle		 = process_throttle_event;
 	script.tool.ordering_requires_timestamps = true;
+	script.tool.merge_deferred_callchains = merge_deferred_callchains;
 	session = perf_session__new(&data, &script.tool);
 	if (IS_ERR(session))
 		return PTR_ERR(session);
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 2884187ccbbecfdc..71dc5a070065dd2a 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -1838,3 +1838,32 @@ int sample__for_each_callchain_node(struct thread *thread, struct evsel *evsel,
 	}
 	return 0;
 }
+
+int sample__merge_deferred_callchain(struct perf_sample *sample_orig,
+				     struct perf_sample *sample_callchain)
+{
+	u64 nr_orig = sample_orig->callchain->nr - 1;
+	u64 nr_deferred = sample_callchain->callchain->nr;
+	struct ip_callchain *callchain;
+
+	if (sample_orig->callchain->nr < 2) {
+		sample_orig->deferred_callchain = false;
+		return -EINVAL;
+	}
+
+	callchain = calloc(1 + nr_orig + nr_deferred, sizeof(u64));
+	if (callchain == NULL) {
+		sample_orig->deferred_callchain = false;
+		return -ENOMEM;
+	}
+
+	callchain->nr = nr_orig + nr_deferred;
+	/* copy original including PERF_CONTEXT_USER_DEFERRED (but not the cookie) */
+	memcpy(callchain->ips, sample_orig->callchain->ips, nr_orig * sizeof(u64));
+	/* copy deferred user callchains */
+	memcpy(&callchain->ips[nr_orig], sample_callchain->callchain->ips,
+	       nr_deferred * sizeof(u64));
+
+	sample_orig->callchain = callchain;
+	return 0;
+}
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index d5ae4fbb7ce5fa44..2a52af8c80ace33c 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -318,4 +318,7 @@ int sample__for_each_callchain_node(struct thread *thread, struct evsel *evsel,
 				    struct perf_sample *sample, int max_stack,
 				    bool symbols, callchain_iter_fn cb, void *data);
 
+int sample__merge_deferred_callchain(struct perf_sample *sample_orig,
+				     struct perf_sample *sample_callchain);
+
 #endif	/* __PERF_CALLCHAIN_H */
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index e8217efdda5323c6..03674d2cbd015e4f 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -85,6 +85,7 @@ void evlist__init(struct evlist *evlist, struct perf_cpu_map *cpus,
 	evlist->ctl_fd.pos = -1;
 	evlist->nr_br_cntr = -1;
 	metricgroup__rblist_init(&evlist->metric_events);
+	INIT_LIST_HEAD(&evlist->deferred_samples);
 }
 
 struct evlist *evlist__new(void)
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index 5e71e3dc60423079..911834ae7c2a6f76 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -92,6 +92,8 @@ struct evlist {
 	 * of struct metric_expr.
 	 */
 	struct rblist	metric_events;
+	/* samples with deferred_callchain would wait here. */
+	struct list_head deferred_samples;
 };
 
 struct evsel_str_handler {
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 361e15c1f26a96d0..dc570ad47ccc2c63 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -1285,6 +1285,66 @@ static int evlist__deliver_sample(struct evlist *evlist, const struct perf_tool
 					    per_thread);
 }
 
+/*
+ * Samples with deferred callchains should wait for the next matching
+ * PERF_RECORD_CALLCHAIN_DEFERRED entries.  Keep the events in a list and
+ * deliver them once it finds the callchains.
+ */
+struct deferred_event {
+	struct list_head list;
+	union perf_event *event;
+};
+
+static int evlist__deliver_deferred_callchain(struct evlist *evlist,
+					      const struct perf_tool *tool,
+					      union  perf_event *event,
+					      struct perf_sample *sample,
+					      struct machine *machine)
+{
+	struct deferred_event *de, *tmp;
+	struct evsel *evsel;
+	int ret = 0;
+
+	if (!tool->merge_deferred_callchains) {
+		evsel = evlist__id2evsel(evlist, sample->id);
+		return tool->callchain_deferred(tool, event, sample,
+						evsel, machine);
+	}
+
+	list_for_each_entry_safe(de, tmp, &evlist->deferred_samples, list) {
+		struct perf_sample orig_sample;
+
+		ret = evlist__parse_sample(evlist, de->event, &orig_sample);
+		if (ret < 0) {
+			pr_err("failed to parse original sample\n");
+			break;
+		}
+
+		if (sample->tid != orig_sample.tid)
+			continue;
+
+		if (event->callchain_deferred.cookie == orig_sample.deferred_cookie)
+			sample__merge_deferred_callchain(&orig_sample, sample);
+		else
+			orig_sample.deferred_callchain = false;
+
+		evsel = evlist__id2evsel(evlist, orig_sample.id);
+		ret = evlist__deliver_sample(evlist, tool, de->event,
+					     &orig_sample, evsel, machine);
+
+		if (orig_sample.deferred_callchain)
+			free(orig_sample.callchain);
+
+		list_del(&de->list);
+		free(de->event);
+		free(de);
+
+		if (ret)
+			break;
+	}
+	return ret;
+}
+
 static int machines__deliver_event(struct machines *machines,
 				   struct evlist *evlist,
 				   union perf_event *event,
@@ -1313,6 +1373,22 @@ static int machines__deliver_event(struct machines *machines,
 			return 0;
 		}
 		dump_sample(evsel, event, sample, perf_env__arch(machine->env));
+		if (sample->deferred_callchain && tool->merge_deferred_callchains) {
+			struct deferred_event *de = malloc(sizeof(*de));
+			size_t sz = event->header.size;
+
+			if (de == NULL)
+				return -ENOMEM;
+
+			de->event = malloc(sz);
+			if (de->event == NULL) {
+				free(de);
+				return -ENOMEM;
+			}
+			memcpy(de->event, event, sz);
+			list_add_tail(&de->list, &evlist->deferred_samples);
+			return 0;
+		}
 		return evlist__deliver_sample(evlist, tool, event, sample, evsel, machine);
 	case PERF_RECORD_MMAP:
 		return tool->mmap(tool, event, sample, machine);
@@ -1372,7 +1448,8 @@ static int machines__deliver_event(struct machines *machines,
 		return tool->aux_output_hw_id(tool, event, sample, machine);
 	case PERF_RECORD_CALLCHAIN_DEFERRED:
 		dump_deferred_callchain(evsel, event, sample);
-		return tool->callchain_deferred(tool, event, sample, evsel, machine);
+		return evlist__deliver_deferred_callchain(evlist, tool, event,
+							  sample, machine);
 	default:
 		++evlist->stats.nr_unknown_events;
 		return -1;
diff --git a/tools/perf/util/tool.c b/tools/perf/util/tool.c
index e77f0e2ecc1f79db..27ba5849c74a2e7d 100644
--- a/tools/perf/util/tool.c
+++ b/tools/perf/util/tool.c
@@ -266,6 +266,7 @@ void perf_tool__init(struct perf_tool *tool, bool ordered_events)
 	tool->cgroup_events = false;
 	tool->no_warn = false;
 	tool->show_feat_hdr = SHOW_FEAT_NO_HEADER;
+	tool->merge_deferred_callchains = true;
 
 	tool->sample = process_event_sample_stub;
 	tool->mmap = process_event_stub;
@@ -448,6 +449,7 @@ void delegate_tool__init(struct delegate_tool *tool, struct perf_tool *delegate)
 	tool->tool.cgroup_events = delegate->cgroup_events;
 	tool->tool.no_warn = delegate->no_warn;
 	tool->tool.show_feat_hdr = delegate->show_feat_hdr;
+	tool->tool.merge_deferred_callchains = delegate->merge_deferred_callchains;
 
 	tool->tool.sample = delegate_sample;
 	tool->tool.read = delegate_read;
diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h
index 9b9f0a8cbf3de4b5..e96b69d25a5b737d 100644
--- a/tools/perf/util/tool.h
+++ b/tools/perf/util/tool.h
@@ -90,6 +90,7 @@ struct perf_tool {
 	bool		cgroup_events;
 	bool		no_warn;
 	bool		dont_split_sample_group;
+	bool		merge_deferred_callchains;
 	enum show_feature_header show_feat_hdr;
 };
 
-- 
2.52.0.rc2.455.g230fcf2819-goog
Re: [PATCH v6 5/6] perf tools: Merge deferred user callchains
Posted by Jens Remus 1 month, 4 weeks ago
Hello Namhyung!

On 11/21/2025 12:48 AM, Namhyung Kim wrote:
> Save samples with deferred callchains in a separate list and deliver
> them after merging the user callchains.  If users don't want to merge
> they can set tool->merge_deferred_callchains to false to prevent the
> behavior.

> diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c

> +int sample__merge_deferred_callchain(struct perf_sample *sample_orig,
> +				     struct perf_sample *sample_callchain)
> +{
> +	u64 nr_orig = sample_orig->callchain->nr - 1;
> +	u64 nr_deferred = sample_callchain->callchain->nr;
> +	struct ip_callchain *callchain;
> +
> +	if (sample_orig->callchain->nr < 2) {
> +		sample_orig->deferred_callchain = false;
> +		return -EINVAL;
> +	}
> +
> +	callchain = calloc(1 + nr_orig + nr_deferred, sizeof(u64));
> +	if (callchain == NULL) {
> +		sample_orig->deferred_callchain = false;
> +		return -ENOMEM;
> +	}
> +
> +	callchain->nr = nr_orig + nr_deferred;
> +	/* copy original including PERF_CONTEXT_USER_DEFERRED (but the cookie) */
> +	memcpy(callchain->ips, sample_orig->callchain->ips, nr_orig * sizeof(u64));
> +	/* copy deferred user callchains */
> +	memcpy(&callchain->ips[nr_orig], sample_callchain->callchain->ips,
> +	       nr_deferred * sizeof(u64));
> +
> +	sample_orig->callchain = callchain;

Hope you don't mind my naive question, as I don't have a clue about perf:

Doesn't the sample_orig->callchain storage need to be free'd prior to
assigning the newly allocated one?  Or is that just part of a large
block that got allocated in one piece?  How is then the one allocated
here ever free'd?

> +	return 0;
> +}
Thanks and regards,
Jens
-- 
Jens Remus
Linux on Z Development (D3303)
+49-7031-16-1128 Office
jremus@de.ibm.com

IBM

IBM Deutschland Research & Development GmbH; Vorsitzender des Aufsichtsrats: Wolfgang Wendt; Geschäftsführung: David Faller; Sitz der Gesellschaft: Böblingen; Registergericht: Amtsgericht Stuttgart, HRB 243294
IBM Data Privacy Statement: https://www.ibm.com/privacy/

Re: [PATCH v6 5/6] perf tools: Merge deferred user callchains
Posted by Jens Remus 1 month, 4 weeks ago
Hello Namhyung,

sorry for the fuss!

On 12/12/2025 12:16 PM, Jens Remus wrote:

> On 11/21/2025 12:48 AM, Namhyung Kim wrote:
>> Save samples with deferred callchains in a separate list and deliver
>> them after merging the user callchains.  If users don't want to merge
>> they can set tool->merge_deferred_callchains to false to prevent the
>> behavior.
> 
>> diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
> 
>> +int sample__merge_deferred_callchain(struct perf_sample *sample_orig,
>> +				     struct perf_sample *sample_callchain)
>> +{
>> +	u64 nr_orig = sample_orig->callchain->nr - 1;
>> +	u64 nr_deferred = sample_callchain->callchain->nr;
>> +	struct ip_callchain *callchain;
>> +
>> +	if (sample_orig->callchain->nr < 2) {
>> +		sample_orig->deferred_callchain = false;
>> +		return -EINVAL;
>> +	}
>> +
>> +	callchain = calloc(1 + nr_orig + nr_deferred, sizeof(u64));
>> +	if (callchain == NULL) {
>> +		sample_orig->deferred_callchain = false;
>> +		return -ENOMEM;
>> +	}
>> +
>> +	callchain->nr = nr_orig + nr_deferred;
>> +	/* copy original including PERF_CONTEXT_USER_DEFERRED (but the cookie) */
>> +	memcpy(callchain->ips, sample_orig->callchain->ips, nr_orig * sizeof(u64));
>> +	/* copy deferred user callchains */
>> +	memcpy(&callchain->ips[nr_orig], sample_callchain->callchain->ips,
>> +	       nr_deferred * sizeof(u64));
>> +
>> +	sample_orig->callchain = callchain;
> 
> Hope you don't mind my naive question, as I don't have a clue about perf:
> 
> Doesn't the sample_orig->callchain storage need to be free'd prior to
> assigning the newly allocated one?  Or is that just part of a large
> block that got allocated in one piece?  How is then the one allocated
> here ever free'd?

Never mind, I found that it is getting free'd in
evlist__deliver_deferred_callchain().

> 
>> +	return 0;
>> +}
> Thanks and regards,
> Jens

Regards,
Jens
-- 
Jens Remus
Linux on Z Development (D3303)
+49-7031-16-1128 Office
jremus@de.ibm.com

IBM

IBM Deutschland Research & Development GmbH; Vorsitzender des Aufsichtsrats: Wolfgang Wendt; Geschäftsführung: David Faller; Sitz der Gesellschaft: Böblingen; Registergericht: Amtsgericht Stuttgart, HRB 243294
IBM Data Privacy Statement: https://www.ibm.com/privacy/

Re: [PATCH v6 5/6] perf tools: Merge deferred user callchains
Posted by Ian Rogers 2 months, 1 week ago
On Thu, Nov 20, 2025 at 3:48 PM Namhyung Kim <namhyung@kernel.org> wrote:
>
> Save samples with deferred callchains in a separate list and deliver
> them after merging the user callchains.  If users don't want to merge
> they can set tool->merge_deferred_callchains to false to prevent the
> behavior.
>
> With previous result, now perf script will show the merged callchains.
>
>   $ perf script
>   ...
>   pwd    2312   121.163435:     249113 cpu/cycles/P:
>           ffffffff845b78d8 __build_id_parse.isra.0+0x218 ([kernel.kallsyms])
>           ffffffff83bb5bf6 perf_event_mmap+0x2e6 ([kernel.kallsyms])
>           ffffffff83c31959 mprotect_fixup+0x1e9 ([kernel.kallsyms])
>           ffffffff83c31dc5 do_mprotect_pkey+0x2b5 ([kernel.kallsyms])
>           ffffffff83c3206f __x64_sys_mprotect+0x1f ([kernel.kallsyms])
>           ffffffff845e6692 do_syscall_64+0x62 ([kernel.kallsyms])
>           ffffffff8360012f entry_SYSCALL_64_after_hwframe+0x76 ([kernel.kallsyms])
>               7f18fe337fa7 mprotect+0x7 (/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2)
>               7f18fe330e0f _dl_sysdep_start+0x7f (/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2)
>               7f18fe331448 _dl_start_user+0x0 (/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2)
>   ...
>
> The old output can be get using --no-merge-callchain option.
> Also perf report can get the user callchain entry at the end.
>
>   $ perf report --no-children --stdio -q -S __build_id_parse.isra.0
>   # symbol: __build_id_parse.isra.0
>        8.40%  pwd      [kernel.kallsyms]
>               |
>               ---__build_id_parse.isra.0
>                  perf_event_mmap
>                  mprotect_fixup
>                  do_mprotect_pkey
>                  __x64_sys_mprotect
>                  do_syscall_64
>                  entry_SYSCALL_64_after_hwframe
>                  mprotect
>                  _dl_sysdep_start
>                  _dl_start_user
>
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>

Reviewed-by: Ian Rogers <irogers@google.com>

> ---
>  tools/perf/Documentation/perf-script.txt |  5 ++
>  tools/perf/builtin-inject.c              |  1 +
>  tools/perf/builtin-report.c              |  1 +
>  tools/perf/builtin-script.c              |  4 ++
>  tools/perf/util/callchain.c              | 29 +++++++++
>  tools/perf/util/callchain.h              |  3 +
>  tools/perf/util/evlist.c                 |  1 +
>  tools/perf/util/evlist.h                 |  2 +
>  tools/perf/util/session.c                | 79 +++++++++++++++++++++++-
>  tools/perf/util/tool.c                   |  2 +
>  tools/perf/util/tool.h                   |  1 +
>  11 files changed, 127 insertions(+), 1 deletion(-)
>
> diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
> index 28bec7e78bc858ba..03d1129606328d6d 100644
> --- a/tools/perf/Documentation/perf-script.txt
> +++ b/tools/perf/Documentation/perf-script.txt
> @@ -527,6 +527,11 @@ include::itrace.txt[]
>         The known limitations include exception handing such as
>         setjmp/longjmp will have calls/returns not match.
>
> +--merge-callchains::
> +       Enable merging deferred user callchains if available.  This is the
> +       default behavior.  If you want to see separate CALLCHAIN_DEFERRED
> +       records for some reason, use --no-merge-callchains explicitly.
> +
>  :GMEXAMPLECMD: script
>  :GMEXAMPLESUBCMD:
>  include::guest-files.txt[]
> diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
> index bd9245d2dd41aa48..51d2721b6db9dccb 100644
> --- a/tools/perf/builtin-inject.c
> +++ b/tools/perf/builtin-inject.c
> @@ -2527,6 +2527,7 @@ int cmd_inject(int argc, const char **argv)
>         inject.tool.auxtrace            = perf_event__repipe_auxtrace;
>         inject.tool.bpf_metadata        = perf_event__repipe_op2_synth;
>         inject.tool.dont_split_sample_group = true;
> +       inject.tool.merge_deferred_callchains = false;
>         inject.session = __perf_session__new(&data, &inject.tool,
>                                              /*trace_event_repipe=*/inject.output.is_pipe,
>                                              /*host_env=*/NULL);
> diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
> index 2bc269f5fcef8023..add6b1c2aaf04270 100644
> --- a/tools/perf/builtin-report.c
> +++ b/tools/perf/builtin-report.c
> @@ -1614,6 +1614,7 @@ int cmd_report(int argc, const char **argv)
>         report.tool.event_update         = perf_event__process_event_update;
>         report.tool.feature              = process_feature_event;
>         report.tool.ordering_requires_timestamps = true;
> +       report.tool.merge_deferred_callchains = !dump_trace;
>
>         session = perf_session__new(&data, &report.tool);
>         if (IS_ERR(session)) {
> diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
> index 85b42205a71b3993..62e43d3c5ad731a0 100644
> --- a/tools/perf/builtin-script.c
> +++ b/tools/perf/builtin-script.c
> @@ -4009,6 +4009,7 @@ int cmd_script(int argc, const char **argv)
>         bool header_only = false;
>         bool script_started = false;
>         bool unsorted_dump = false;
> +       bool merge_deferred_callchains = true;
>         char *rec_script_path = NULL;
>         char *rep_script_path = NULL;
>         struct perf_session *session;
> @@ -4162,6 +4163,8 @@ int cmd_script(int argc, const char **argv)
>                     "Guest code can be found in hypervisor process"),
>         OPT_BOOLEAN('\0', "stitch-lbr", &script.stitch_lbr,
>                     "Enable LBR callgraph stitching approach"),
> +       OPT_BOOLEAN('\0', "merge-callchains", &merge_deferred_callchains,
> +                   "Enable merge deferred user callchains"),
>         OPTS_EVSWITCH(&script.evswitch),
>         OPT_END()
>         };
> @@ -4418,6 +4421,7 @@ int cmd_script(int argc, const char **argv)
>         script.tool.throttle             = process_throttle_event;
>         script.tool.unthrottle           = process_throttle_event;
>         script.tool.ordering_requires_timestamps = true;
> +       script.tool.merge_deferred_callchains = merge_deferred_callchains;
>         session = perf_session__new(&data, &script.tool);
>         if (IS_ERR(session))
>                 return PTR_ERR(session);
> diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
> index 2884187ccbbecfdc..71dc5a070065dd2a 100644
> --- a/tools/perf/util/callchain.c
> +++ b/tools/perf/util/callchain.c
> @@ -1838,3 +1838,32 @@ int sample__for_each_callchain_node(struct thread *thread, struct evsel *evsel,
>         }
>         return 0;
>  }
> +
> +int sample__merge_deferred_callchain(struct perf_sample *sample_orig,

nit: We use the term deferred rather than original except in this
context. I think deferred is a little more intention revealing than
original. Perhaps add a comment capturing that the original sample is
the deferred kernel sample.

Thanks,
Ian

> +                                    struct perf_sample *sample_callchain)
> +{
> +       u64 nr_orig = sample_orig->callchain->nr - 1;
> +       u64 nr_deferred = sample_callchain->callchain->nr;
> +       struct ip_callchain *callchain;
> +
> +       if (sample_orig->callchain->nr < 2) {
> +               sample_orig->deferred_callchain = false;
> +               return -EINVAL;
> +       }
> +
> +       callchain = calloc(1 + nr_orig + nr_deferred, sizeof(u64));
> +       if (callchain == NULL) {
> +               sample_orig->deferred_callchain = false;
> +               return -ENOMEM;
> +       }
> +
> +       callchain->nr = nr_orig + nr_deferred;
> +       /* copy original including PERF_CONTEXT_USER_DEFERRED (but the cookie) */
> +       memcpy(callchain->ips, sample_orig->callchain->ips, nr_orig * sizeof(u64));
> +       /* copy deferred user callchains */
> +       memcpy(&callchain->ips[nr_orig], sample_callchain->callchain->ips,
> +              nr_deferred * sizeof(u64));
> +
> +       sample_orig->callchain = callchain;
> +       return 0;
> +}
> diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
> index d5ae4fbb7ce5fa44..2a52af8c80ace33c 100644
> --- a/tools/perf/util/callchain.h
> +++ b/tools/perf/util/callchain.h
> @@ -318,4 +318,7 @@ int sample__for_each_callchain_node(struct thread *thread, struct evsel *evsel,
>                                     struct perf_sample *sample, int max_stack,
>                                     bool symbols, callchain_iter_fn cb, void *data);
>
> +int sample__merge_deferred_callchain(struct perf_sample *sample_orig,
> +                                    struct perf_sample *sample_callchain);
> +
>  #endif /* __PERF_CALLCHAIN_H */
> diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
> index e8217efdda5323c6..03674d2cbd015e4f 100644
> --- a/tools/perf/util/evlist.c
> +++ b/tools/perf/util/evlist.c
> @@ -85,6 +85,7 @@ void evlist__init(struct evlist *evlist, struct perf_cpu_map *cpus,
>         evlist->ctl_fd.pos = -1;
>         evlist->nr_br_cntr = -1;
>         metricgroup__rblist_init(&evlist->metric_events);
> +       INIT_LIST_HEAD(&evlist->deferred_samples);
>  }
>
>  struct evlist *evlist__new(void)
> diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
> index 5e71e3dc60423079..911834ae7c2a6f76 100644
> --- a/tools/perf/util/evlist.h
> +++ b/tools/perf/util/evlist.h
> @@ -92,6 +92,8 @@ struct evlist {
>          * of struct metric_expr.
>          */
>         struct rblist   metric_events;
> +       /* samples with deferred_callchain would wait here. */
> +       struct list_head deferred_samples;
>  };
>
>  struct evsel_str_handler {
> diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
> index 361e15c1f26a96d0..dc570ad47ccc2c63 100644
> --- a/tools/perf/util/session.c
> +++ b/tools/perf/util/session.c
> @@ -1285,6 +1285,66 @@ static int evlist__deliver_sample(struct evlist *evlist, const struct perf_tool
>                                             per_thread);
>  }
>
> +/*
> + * Samples with deferred callchains should wait for the next matching
> + * PERF_RECORD_CALLCHAIN_RECORD entries.  Keep the events in a list and
> + * deliver them once it finds the callchains.
> + */
> +struct deferred_event {
> +       struct list_head list;
> +       union perf_event *event;
> +};
> +
> +static int evlist__deliver_deferred_callchain(struct evlist *evlist,
> +                                             const struct perf_tool *tool,
> +                                             union  perf_event *event,
> +                                             struct perf_sample *sample,
> +                                             struct machine *machine)
> +{
> +       struct deferred_event *de, *tmp;
> +       struct evsel *evsel;
> +       int ret = 0;
> +
> +       if (!tool->merge_deferred_callchains) {
> +               evsel = evlist__id2evsel(evlist, sample->id);
> +               return tool->callchain_deferred(tool, event, sample,
> +                                               evsel, machine);
> +       }
> +
> +       list_for_each_entry_safe(de, tmp, &evlist->deferred_samples, list) {
> +               struct perf_sample orig_sample;
> +
> +               ret = evlist__parse_sample(evlist, de->event, &orig_sample);
> +               if (ret < 0) {
> +                       pr_err("failed to parse original sample\n");
> +                       break;
> +               }
> +
> +               if (sample->tid != orig_sample.tid)
> +                       continue;
> +
> +               if (event->callchain_deferred.cookie == orig_sample.deferred_cookie)
> +                       sample__merge_deferred_callchain(&orig_sample, sample);
> +               else
> +                       orig_sample.deferred_callchain = false;
> +
> +               evsel = evlist__id2evsel(evlist, orig_sample.id);
> +               ret = evlist__deliver_sample(evlist, tool, de->event,
> +                                            &orig_sample, evsel, machine);
> +
> +               if (orig_sample.deferred_callchain)
> +                       free(orig_sample.callchain);
> +
> +               list_del(&de->list);
> +               free(de->event);
> +               free(de);
> +
> +               if (ret)
> +                       break;
> +       }
> +       return ret;
> +}
> +
>  static int machines__deliver_event(struct machines *machines,
>                                    struct evlist *evlist,
>                                    union perf_event *event,
> @@ -1313,6 +1373,22 @@ static int machines__deliver_event(struct machines *machines,
>                         return 0;
>                 }
>                 dump_sample(evsel, event, sample, perf_env__arch(machine->env));
> +               if (sample->deferred_callchain && tool->merge_deferred_callchains) {
> +                       struct deferred_event *de = malloc(sizeof(*de));
> +                       size_t sz = event->header.size;
> +
> +                       if (de == NULL)
> +                               return -ENOMEM;
> +
> +                       de->event = malloc(sz);
> +                       if (de->event == NULL) {
> +                               free(de);
> +                               return -ENOMEM;
> +                       }
> +                       memcpy(de->event, event, sz);
> +                       list_add_tail(&de->list, &evlist->deferred_samples);
> +                       return 0;
> +               }
>                 return evlist__deliver_sample(evlist, tool, event, sample, evsel, machine);
>         case PERF_RECORD_MMAP:
>                 return tool->mmap(tool, event, sample, machine);
> @@ -1372,7 +1448,8 @@ static int machines__deliver_event(struct machines *machines,
>                 return tool->aux_output_hw_id(tool, event, sample, machine);
>         case PERF_RECORD_CALLCHAIN_DEFERRED:
>                 dump_deferred_callchain(evsel, event, sample);
> -               return tool->callchain_deferred(tool, event, sample, evsel, machine);
> +               return evlist__deliver_deferred_callchain(evlist, tool, event,
> +                                                         sample, machine);
>         default:
>                 ++evlist->stats.nr_unknown_events;
>                 return -1;
> diff --git a/tools/perf/util/tool.c b/tools/perf/util/tool.c
> index e77f0e2ecc1f79db..27ba5849c74a2e7d 100644
> --- a/tools/perf/util/tool.c
> +++ b/tools/perf/util/tool.c
> @@ -266,6 +266,7 @@ void perf_tool__init(struct perf_tool *tool, bool ordered_events)
>         tool->cgroup_events = false;
>         tool->no_warn = false;
>         tool->show_feat_hdr = SHOW_FEAT_NO_HEADER;
> +       tool->merge_deferred_callchains = true;
>
>         tool->sample = process_event_sample_stub;
>         tool->mmap = process_event_stub;
> @@ -448,6 +449,7 @@ void delegate_tool__init(struct delegate_tool *tool, struct perf_tool *delegate)
>         tool->tool.cgroup_events = delegate->cgroup_events;
>         tool->tool.no_warn = delegate->no_warn;
>         tool->tool.show_feat_hdr = delegate->show_feat_hdr;
> +       tool->tool.merge_deferred_callchains = delegate->merge_deferred_callchains;
>
>         tool->tool.sample = delegate_sample;
>         tool->tool.read = delegate_read;
> diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h
> index 9b9f0a8cbf3de4b5..e96b69d25a5b737d 100644
> --- a/tools/perf/util/tool.h
> +++ b/tools/perf/util/tool.h
> @@ -90,6 +90,7 @@ struct perf_tool {
>         bool            cgroup_events;
>         bool            no_warn;
>         bool            dont_split_sample_group;
> +       bool            merge_deferred_callchains;
>         enum show_feature_header show_feat_hdr;
>  };
>
> --
> 2.52.0.rc2.455.g230fcf2819-goog
>
Re: [PATCH v6 5/6] perf tools: Merge deferred user callchains
Posted by Namhyung Kim 2 months, 1 week ago
On Tue, Dec 02, 2025 at 03:14:31PM -0800, Ian Rogers wrote:
> On Thu, Nov 20, 2025 at 3:48 PM Namhyung Kim <namhyung@kernel.org> wrote:
> >
> > Save samples with deferred callchains in a separate list and deliver
> > them after merging the user callchains.  If users don't want to merge
> > they can set tool->merge_deferred_callchains to false to prevent the
> > behavior.
> >
> > With the previous result, perf script will now show the merged callchains.
> >
> >   $ perf script
> >   ...
> >   pwd    2312   121.163435:     249113 cpu/cycles/P:
> >           ffffffff845b78d8 __build_id_parse.isra.0+0x218 ([kernel.kallsyms])
> >           ffffffff83bb5bf6 perf_event_mmap+0x2e6 ([kernel.kallsyms])
> >           ffffffff83c31959 mprotect_fixup+0x1e9 ([kernel.kallsyms])
> >           ffffffff83c31dc5 do_mprotect_pkey+0x2b5 ([kernel.kallsyms])
> >           ffffffff83c3206f __x64_sys_mprotect+0x1f ([kernel.kallsyms])
> >           ffffffff845e6692 do_syscall_64+0x62 ([kernel.kallsyms])
> >           ffffffff8360012f entry_SYSCALL_64_after_hwframe+0x76 ([kernel.kallsyms])
> >               7f18fe337fa7 mprotect+0x7 (/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2)
> >               7f18fe330e0f _dl_sysdep_start+0x7f (/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2)
> >               7f18fe331448 _dl_start_user+0x0 (/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2)
> >   ...
> >
> > The old output can be obtained using the --no-merge-callchains option.
> > Also perf report can get the user callchain entry at the end.
> >
> >   $ perf report --no-children --stdio -q -S __build_id_parse.isra.0
> >   # symbol: __build_id_parse.isra.0
> >        8.40%  pwd      [kernel.kallsyms]
> >               |
> >               ---__build_id_parse.isra.0
> >                  perf_event_mmap
> >                  mprotect_fixup
> >                  do_mprotect_pkey
> >                  __x64_sys_mprotect
> >                  do_syscall_64
> >                  entry_SYSCALL_64_after_hwframe
> >                  mprotect
> >                  _dl_sysdep_start
> >                  _dl_start_user
> >
> > Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> 
> Reviewed-by: Ian Rogers <irogers@google.com>
> 
> > ---
> >  tools/perf/Documentation/perf-script.txt |  5 ++
> >  tools/perf/builtin-inject.c              |  1 +
> >  tools/perf/builtin-report.c              |  1 +
> >  tools/perf/builtin-script.c              |  4 ++
> >  tools/perf/util/callchain.c              | 29 +++++++++
> >  tools/perf/util/callchain.h              |  3 +
> >  tools/perf/util/evlist.c                 |  1 +
> >  tools/perf/util/evlist.h                 |  2 +
> >  tools/perf/util/session.c                | 79 +++++++++++++++++++++++-
> >  tools/perf/util/tool.c                   |  2 +
> >  tools/perf/util/tool.h                   |  1 +
> >  11 files changed, 127 insertions(+), 1 deletion(-)
> >
> > diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
> > index 28bec7e78bc858ba..03d1129606328d6d 100644
> > --- a/tools/perf/Documentation/perf-script.txt
> > +++ b/tools/perf/Documentation/perf-script.txt
> > @@ -527,6 +527,11 @@ include::itrace.txt[]
> >         The known limitations include exception handing such as
> >         setjmp/longjmp will have calls/returns not match.
> >
> > +--merge-callchains::
> > +       Enable merging deferred user callchains if available.  This is the
> > +       default behavior.  If you want to see separate CALLCHAIN_DEFERRED
> > +       records for some reason, use --no-merge-callchains explicitly.
> > +
> >  :GMEXAMPLECMD: script
> >  :GMEXAMPLESUBCMD:
> >  include::guest-files.txt[]
> > diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
> > index bd9245d2dd41aa48..51d2721b6db9dccb 100644
> > --- a/tools/perf/builtin-inject.c
> > +++ b/tools/perf/builtin-inject.c
> > @@ -2527,6 +2527,7 @@ int cmd_inject(int argc, const char **argv)
> >         inject.tool.auxtrace            = perf_event__repipe_auxtrace;
> >         inject.tool.bpf_metadata        = perf_event__repipe_op2_synth;
> >         inject.tool.dont_split_sample_group = true;
> > +       inject.tool.merge_deferred_callchains = false;
> >         inject.session = __perf_session__new(&data, &inject.tool,
> >                                              /*trace_event_repipe=*/inject.output.is_pipe,
> >                                              /*host_env=*/NULL);
> > diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
> > index 2bc269f5fcef8023..add6b1c2aaf04270 100644
> > --- a/tools/perf/builtin-report.c
> > +++ b/tools/perf/builtin-report.c
> > @@ -1614,6 +1614,7 @@ int cmd_report(int argc, const char **argv)
> >         report.tool.event_update         = perf_event__process_event_update;
> >         report.tool.feature              = process_feature_event;
> >         report.tool.ordering_requires_timestamps = true;
> > +       report.tool.merge_deferred_callchains = !dump_trace;
> >
> >         session = perf_session__new(&data, &report.tool);
> >         if (IS_ERR(session)) {
> > diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
> > index 85b42205a71b3993..62e43d3c5ad731a0 100644
> > --- a/tools/perf/builtin-script.c
> > +++ b/tools/perf/builtin-script.c
> > @@ -4009,6 +4009,7 @@ int cmd_script(int argc, const char **argv)
> >         bool header_only = false;
> >         bool script_started = false;
> >         bool unsorted_dump = false;
> > +       bool merge_deferred_callchains = true;
> >         char *rec_script_path = NULL;
> >         char *rep_script_path = NULL;
> >         struct perf_session *session;
> > @@ -4162,6 +4163,8 @@ int cmd_script(int argc, const char **argv)
> >                     "Guest code can be found in hypervisor process"),
> >         OPT_BOOLEAN('\0', "stitch-lbr", &script.stitch_lbr,
> >                     "Enable LBR callgraph stitching approach"),
> > +       OPT_BOOLEAN('\0', "merge-callchains", &merge_deferred_callchains,
> > +                   "Enable merge deferred user callchains"),
> >         OPTS_EVSWITCH(&script.evswitch),
> >         OPT_END()
> >         };
> > @@ -4418,6 +4421,7 @@ int cmd_script(int argc, const char **argv)
> >         script.tool.throttle             = process_throttle_event;
> >         script.tool.unthrottle           = process_throttle_event;
> >         script.tool.ordering_requires_timestamps = true;
> > +       script.tool.merge_deferred_callchains = merge_deferred_callchains;
> >         session = perf_session__new(&data, &script.tool);
> >         if (IS_ERR(session))
> >                 return PTR_ERR(session);
> > diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
> > index 2884187ccbbecfdc..71dc5a070065dd2a 100644
> > --- a/tools/perf/util/callchain.c
> > +++ b/tools/perf/util/callchain.c
> > @@ -1838,3 +1838,32 @@ int sample__for_each_callchain_node(struct thread *thread, struct evsel *evsel,
> >         }
> >         return 0;
> >  }
> > +
> > +int sample__merge_deferred_callchain(struct perf_sample *sample_orig,
> 
> nit: We use the term "deferred" rather than "original" everywhere except
> in this context. I think "deferred" is a little more intention-revealing
> than "original". Perhaps add a comment capturing that the original sample
> is the deferred kernel sample.

Sure, will add.

Thanks,
Namhyung