[PATCH v12 08/11] perf tools: Minimal CALLCHAIN_DEFERRED support

Posted by Steven Rostedt 3 months, 1 week ago
From: Namhyung Kim <namhyung@kernel.org>

Add a new event type for deferred callchains and a new callback for the
struct perf_tool.  For now it doesn't actually handle the deferred
callchains; it just marks the sample if the callchain array contains
PERF_CONTEXT_USER_DEFERRED.

At least perf report can dump the raw data with this change.  This
requires the next commit to enable attr.defer_callchain, but if you
already have a data file, it will show the following result.

  $ perf report -D
  ...
  0x5fe0@perf.data [0x40]: event: 22
  .
  . ... raw event: size 64 bytes
  .  0000:  16 00 00 00 02 00 40 00 02 00 00 00 00 00 00 00  ......@.........
  .  0010:  00 fe ff ff ff ff ff ff 4b d3 3f 25 45 7f 00 00  ........K.?%E...
  .  0020:  21 03 00 00 21 03 00 00 43 02 12 ab 05 00 00 00  !...!...C.......
  .  0030:  00 00 00 00 00 00 00 00 09 00 00 00 00 00 00 00  ................

  0 24344920643 0x5fe0 [0x40]: PERF_RECORD_CALLCHAIN_DEFERRED(IP, 0x2): 801/801: 0
  ... FP chain: nr:2
  .....  0: fffffffffffffe00
  .....  1: 00007f45253fd34b
  : unhandled!

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 tools/lib/perf/include/perf/event.h       |  7 +++++++
 tools/perf/util/event.c                   |  1 +
 tools/perf/util/evsel.c                   | 15 +++++++++++++++
 tools/perf/util/machine.c                 |  1 +
 tools/perf/util/perf_event_attr_fprintf.c |  1 +
 tools/perf/util/sample.h                  |  3 ++-
 tools/perf/util/session.c                 | 17 +++++++++++++++++
 tools/perf/util/tool.c                    |  1 +
 tools/perf/util/tool.h                    |  3 ++-
 9 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/tools/lib/perf/include/perf/event.h b/tools/lib/perf/include/perf/event.h
index 09b7c643ddac..4590d62ca913 100644
--- a/tools/lib/perf/include/perf/event.h
+++ b/tools/lib/perf/include/perf/event.h
@@ -151,6 +151,12 @@ struct perf_record_switch {
 	__u32			 next_prev_tid;
 };
 
+struct perf_record_callchain_deferred {
+	struct perf_event_header header;
+	__u64			 nr;
+	__u64			 ips[];
+};
+
 struct perf_record_header_attr {
 	struct perf_event_header header;
 	struct perf_event_attr	 attr;
@@ -505,6 +511,7 @@ union perf_event {
 	struct perf_record_read			read;
 	struct perf_record_throttle		throttle;
 	struct perf_record_sample		sample;
+	struct perf_record_callchain_deferred	callchain_deferred;
 	struct perf_record_bpf_event		bpf;
 	struct perf_record_ksymbol		ksymbol;
 	struct perf_record_text_poke_event	text_poke;
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 7544a3104e21..720682fea9be 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -58,6 +58,7 @@ static const char *perf_event__names[] = {
 	[PERF_RECORD_CGROUP]			= "CGROUP",
 	[PERF_RECORD_TEXT_POKE]			= "TEXT_POKE",
 	[PERF_RECORD_AUX_OUTPUT_HW_ID]		= "AUX_OUTPUT_HW_ID",
+	[PERF_RECORD_CALLCHAIN_DEFERRED]	= "CALLCHAIN_DEFERRED",
 	[PERF_RECORD_HEADER_ATTR]		= "ATTR",
 	[PERF_RECORD_HEADER_EVENT_TYPE]		= "EVENT_TYPE",
 	[PERF_RECORD_HEADER_TRACING_DATA]	= "TRACING_DATA",
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index d55482f094bf..6176c31b57ea 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -2978,6 +2978,18 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event,
 	data->data_src = PERF_MEM_DATA_SRC_NONE;
 	data->vcpu = -1;
 
+	if (event->header.type == PERF_RECORD_CALLCHAIN_DEFERRED) {
+		const u64 max_callchain_nr = UINT64_MAX / sizeof(u64);
+
+		data->callchain = (struct ip_callchain *)&event->callchain_deferred.nr;
+		if (data->callchain->nr > max_callchain_nr)
+			return -EFAULT;
+
+		if (evsel->core.attr.sample_id_all)
+			perf_evsel__parse_id_sample(evsel, event, data);
+		return 0;
+	}
+
 	if (event->header.type != PERF_RECORD_SAMPLE) {
 		if (!evsel->core.attr.sample_id_all)
 			return 0;
@@ -3108,6 +3120,9 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event,
 		if (data->callchain->nr > max_callchain_nr)
 			return -EFAULT;
 		sz = data->callchain->nr * sizeof(u64);
+		if (evsel->core.attr.defer_callchain && data->callchain->nr >= 1 &&
+		    data->callchain->ips[data->callchain->nr - 1] == PERF_CONTEXT_USER_DEFERRED)
+			data->deferred_callchain = true;
 		OVERFLOW_CHECK(array, sz, max_size);
 		array = (void *)array + sz;
 	}
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 7ec12c207970..de6100366eee 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -2124,6 +2124,7 @@ static int add_callchain_ip(struct thread *thread,
 				*cpumode = PERF_RECORD_MISC_KERNEL;
 				break;
 			case PERF_CONTEXT_USER:
+			case PERF_CONTEXT_USER_DEFERRED:
 				*cpumode = PERF_RECORD_MISC_USER;
 				break;
 			default:
diff --git a/tools/perf/util/perf_event_attr_fprintf.c b/tools/perf/util/perf_event_attr_fprintf.c
index 66b666d9ce64..abfd9b9a718c 100644
--- a/tools/perf/util/perf_event_attr_fprintf.c
+++ b/tools/perf/util/perf_event_attr_fprintf.c
@@ -343,6 +343,7 @@ int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr,
 	PRINT_ATTRf(inherit_thread, p_unsigned);
 	PRINT_ATTRf(remove_on_exec, p_unsigned);
 	PRINT_ATTRf(sigtrap, p_unsigned);
+	PRINT_ATTRf(defer_callchain, p_unsigned);
 
 	PRINT_ATTRn("{ wakeup_events, wakeup_watermark }", wakeup_events, p_unsigned, false);
 	PRINT_ATTRf(bp_type, p_unsigned);
diff --git a/tools/perf/util/sample.h b/tools/perf/util/sample.h
index 0e96240052e9..9d6e2f14551c 100644
--- a/tools/perf/util/sample.h
+++ b/tools/perf/util/sample.h
@@ -108,7 +108,8 @@ struct perf_sample {
 		u16 p_stage_cyc;
 		u16 retire_lat;
 	};
-	bool no_hw_idx;		/* No hw_idx collected in branch_stack */
+	bool no_hw_idx;			/* No hw_idx collected in branch_stack */
+	bool deferred_callchain;	/* Has deferred user callchains */
 	char insn[MAX_INSN];
 	void *raw_data;
 	struct ip_callchain *callchain;
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index a320672c264e..e6be01459352 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -715,6 +715,7 @@ static perf_event__swap_op perf_event__swap_ops[] = {
 	[PERF_RECORD_CGROUP]		  = perf_event__cgroup_swap,
 	[PERF_RECORD_TEXT_POKE]		  = perf_event__text_poke_swap,
 	[PERF_RECORD_AUX_OUTPUT_HW_ID]	  = perf_event__all64_swap,
+	[PERF_RECORD_CALLCHAIN_DEFERRED]  = perf_event__all64_swap,
 	[PERF_RECORD_HEADER_ATTR]	  = perf_event__hdr_attr_swap,
 	[PERF_RECORD_HEADER_EVENT_TYPE]	  = perf_event__event_type_swap,
 	[PERF_RECORD_HEADER_TRACING_DATA] = perf_event__tracing_data_swap,
@@ -1118,6 +1119,19 @@ static void dump_sample(struct evsel *evsel, union perf_event *event,
 		sample_read__printf(sample, evsel->core.attr.read_format);
 }
 
+static void dump_deferred_callchain(struct evsel *evsel, union perf_event *event,
+				    struct perf_sample *sample)
+{
+	if (!dump_trace)
+		return;
+
+	printf("(IP, 0x%x): %d/%d: %#" PRIx64 "\n",
+	       event->header.misc, sample->pid, sample->tid, sample->ip);
+
+	if (evsel__has_callchain(evsel))
+		callchain__printf(evsel, sample);
+}
+
 static void dump_read(struct evsel *evsel, union perf_event *event)
 {
 	struct perf_record_read *read_event = &event->read;
@@ -1348,6 +1362,9 @@ static int machines__deliver_event(struct machines *machines,
 		return tool->text_poke(tool, event, sample, machine);
 	case PERF_RECORD_AUX_OUTPUT_HW_ID:
 		return tool->aux_output_hw_id(tool, event, sample, machine);
+	case PERF_RECORD_CALLCHAIN_DEFERRED:
+		dump_deferred_callchain(evsel, event, sample);
+		return tool->callchain_deferred(tool, event, sample, evsel, machine);
 	default:
 		++evlist->stats.nr_unknown_events;
 		return -1;
diff --git a/tools/perf/util/tool.c b/tools/perf/util/tool.c
index 37bd8ac63b01..e1d60abb4e41 100644
--- a/tools/perf/util/tool.c
+++ b/tools/perf/util/tool.c
@@ -266,6 +266,7 @@ void perf_tool__init(struct perf_tool *tool, bool ordered_events)
 	tool->read = process_event_sample_stub;
 	tool->throttle = process_event_stub;
 	tool->unthrottle = process_event_stub;
+	tool->callchain_deferred = process_event_sample_stub;
 	tool->attr = process_event_synth_attr_stub;
 	tool->event_update = process_event_synth_event_update_stub;
 	tool->tracing_data = process_event_synth_tracing_data_stub;
diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h
index db1c7642b0d1..9987bbde6d5e 100644
--- a/tools/perf/util/tool.h
+++ b/tools/perf/util/tool.h
@@ -42,7 +42,8 @@ enum show_feature_header {
 
 struct perf_tool {
 	event_sample	sample,
-			read;
+			read,
+			callchain_deferred;
 	event_op	mmap,
 			mmap2,
 			comm,
-- 
2.47.2
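
A minimal sketch of how a tool-side consumer could hook the new
callback, assuming it takes the same event_sample prototype as
tool->sample (the argument order matches the machines__deliver_event()
call in the session.c hunk above; the handler name here is made up):

	/* Sketch only, not part of this patch. */
	static int process_callchain_deferred(const struct perf_tool *tool __maybe_unused,
					      union perf_event *event,
					      struct perf_sample *sample,
					      struct evsel *evsel __maybe_unused,
					      struct machine *machine __maybe_unused)
	{
		struct perf_record_callchain_deferred *cd = &event->callchain_deferred;

		/* The deferred user callchain is in cd->ips[0 .. cd->nr - 1]. */
		printf("deferred callchain for %u/%u: %llu entries\n",
		       sample->pid, sample->tid, (unsigned long long)cd->nr);
		return 0;
	}

	/* e.g. during tool setup: */
	perf_tool__init(&tool, /* ordered_events */ true);
	tool.callchain_deferred = process_callchain_deferred;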
Re: [PATCH v12 08/11] perf tools: Minimal CALLCHAIN_DEFERRED support
Posted by Jens Remus 3 months, 1 week ago
Hello Steve!

On 01.07.2025 20:04, Steven Rostedt wrote:
> From: Namhyung Kim <namhyung@kernel.org>
> 
> Add a new event type for deferred callchains and a new callback for the
> struct perf_tool.  For now it doesn't actually handle the deferred
> callchains; it just marks the sample if the callchain array contains
> PERF_CONTEXT_USER_DEFERRED.

> diff --git a/tools/lib/perf/include/perf/event.h b/tools/lib/perf/include/perf/event.h

> @@ -151,6 +151,12 @@ struct perf_record_switch {
>  	__u32			 next_prev_tid;
>  };
>  
> +struct perf_record_callchain_deferred {
> +	struct perf_event_header header;

At minimum the timestamp field added to perf with "[PATCH v12 07/11]
perf: Support deferred user callchains for per CPU events" needs to be
added here as well:

	__u64			 timestamp;

Otherwise this and any subsequent enhancements to the perf tools no
longer work at all.  But presumably the timestamp field also needs to be
used for some purpose?

> +	__u64			 nr;
> +	__u64			 ips[];
> +};
> +
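
That is, the tool-side record would then roughly mirror the kernel-side
layout from patch 07/11 (a sketch only; the field placement follows the
quote above):

	struct perf_record_callchain_deferred {
		struct perf_event_header header;
		__u64			 timestamp;
		__u64			 nr;
		__u64			 ips[];
	};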

Thanks and regards,
Jens
-- 
Jens Remus
Linux on Z Development (D3303)
+49-7031-16-1128 Office
jremus@de.ibm.com


Re: [PATCH v12 08/11] perf tools: Minimal CALLCHAIN_DEFERRED support
Posted by Steven Rostedt 3 months, 1 week ago
On Wed, 2 Jul 2025 14:23:24 +0200
Jens Remus <jremus@linux.ibm.com> wrote:

> > +struct perf_record_callchain_deferred {
> > +	struct perf_event_header header;  
> 
> At minimum the timestamp field added to perf with "[PATCH v12 07/11]
> perf: Support deferred user callchains for per CPU events" needs to be
> added here as well:

Thanks for the review.

> 
> 	__u64			 timestamp;
> 
> Otherwise this and any subsequent enhancements to the perf tools no
> longer work at all.  But presumably the timestamp field also needs to
> be used for some purpose?

OK, so this may be part of the discussion about using this as a
timestamp. The timestamp is the one given by the deferred trace
infrastructure; it holds the time that this stack trace is valid for.
But that assumes the timestamp is on the same clock that perf is
using.

In case of dropped events, we could have the case of:

  system_call() {
    <nmi> {
      take kernel stack trace
      ask for deferred trace.

   [EVENTS START DROPPING HERE]
    }
    Call deferred callback to record trace [ BUT IS DROPPED ]
  }

  system_call() {
    <nmi> {
      take kernel stack trace
      ask for deferred trace [ STILL DROPPING ]
    }
    [ READER CATCHES UP AND STARTS READING EVENTS AGAIN ]

    Call deferred callback to record trace
  }

The user space tool will see the kernel stack traces of the first
system call, then it will see that events were dropped, and then it will
see the deferred user space stack trace of the second call.

If the timestamp that is passed in is in sync with perf's clock, then
the tool will see that the kernel stack traces from the first system
call are older than the timestamp of the user stack trace and know that
they are not related.

Without either saving the timestamp along with the kernel stack traces,
or having the timestamps in sync, user space will not know whether the
user space stack trace of the second system call belongs to the kernel
stack traces that were recorded in the first system call.
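
Roughly, the association check on the tool side would boil down to
something like this (pseudo-code, none of these names exist in perf):

	static bool kernel_sample_matches_deferred(u64 kernel_sample_time,
						   u64 deferred_timestamp)
	{
		/*
		 * Kernel samples taken before the time the deferred user
		 * stack is valid for came from an earlier entry into the
		 * kernel (possibly one whose deferred request was
		 * dropped) and must not have this user callchain
		 * appended to them.
		 */
		return kernel_sample_time >= deferred_timestamp;
	}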

I guess the question is, do we just not associate stack traces beyond
where events are dropped? If so, we don't even need to save the
timestamp.

-- Steve

