[PATCH v5 13/17] perf: Support deferred user callchains

Posted by Steven Rostedt 7 months, 4 weeks ago
From: Josh Poimboeuf <jpoimboe@kernel.org>

Use the new unwind_deferred_trace() interface (if available) to defer
unwinds to task context.  This will allow the use of .sframe (when it
becomes available) and also prevent duplicate userspace unwinds.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Co-developed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 arch/Kconfig                          |   3 +
 include/linux/perf_event.h            |  13 +-
 include/uapi/linux/perf_event.h       |  19 ++-
 kernel/bpf/stackmap.c                 |   4 +-
 kernel/events/callchain.c             |  11 +-
 kernel/events/core.c                  | 163 +++++++++++++++++++++++++-
 tools/include/uapi/linux/perf_event.h |  19 ++-
 7 files changed, 224 insertions(+), 8 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index dbb1cc89e040..681946b5f2c4 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -446,6 +446,9 @@ config HAVE_UNWIND_USER_COMPAT_FP
 	bool
 	depends on HAVE_UNWIND_USER_FP
 
+config HAVE_PERF_CALLCHAIN_DEFERRED
+	bool
+
 config HAVE_PERF_REGS
 	bool
 	help
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 3cc0b0ea0afa..564594548d82 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -62,6 +62,7 @@ struct perf_guest_info_callbacks {
 #include <linux/security.h>
 #include <linux/static_call.h>
 #include <linux/lockdep.h>
+#include <linux/unwind_deferred.h>
 #include <asm/local.h>
 
 struct perf_callchain_entry {
@@ -830,6 +831,10 @@ struct perf_event {
 	struct callback_head		pending_task;
 	unsigned int			pending_work;
 
+	unsigned int			pending_unwind_callback;
+	struct callback_head		pending_unwind_work;
+	struct rcuwait			pending_unwind_wait;
+
 	atomic_t			event_limit;
 
 	/* address range filters */
@@ -1652,12 +1657,18 @@ extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct p
 extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
 extern struct perf_callchain_entry *
 get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
-		   u32 max_stack, bool crosstask, bool add_mark);
+		   u32 max_stack, bool crosstask, bool add_mark, bool defer_user);
 extern int get_callchain_buffers(int max_stack);
 extern void put_callchain_buffers(void);
 extern struct perf_callchain_entry *get_callchain_entry(int *rctx);
 extern void put_callchain_entry(int rctx);
 
+#ifdef CONFIG_HAVE_PERF_CALLCHAIN_DEFERRED
+extern void perf_callchain_user_deferred(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
+#else
+static inline void perf_callchain_user_deferred(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) {}
+#endif
+
 extern int sysctl_perf_event_max_stack;
 extern int sysctl_perf_event_max_contexts_per_stack;
 
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 5fc753c23734..65fe495c012e 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -462,7 +462,8 @@ struct perf_event_attr {
 				inherit_thread :  1, /* children only inherit if cloned with CLONE_THREAD */
 				remove_on_exec :  1, /* event is removed from task on exec */
 				sigtrap        :  1, /* send synchronous SIGTRAP on event */
-				__reserved_1   : 26;
+				defer_callchain:  1, /* generate PERF_RECORD_CALLCHAIN_DEFERRED records */
+				__reserved_1   : 25;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
@@ -1228,6 +1229,21 @@ enum perf_event_type {
 	 */
 	PERF_RECORD_AUX_OUTPUT_HW_ID		= 21,
 
+	/*
+	 * This user callchain capture was deferred until shortly before
+	 * returning to user space.  Previous samples would have kernel
+	 * callchains only and they need to be stitched with this to make full
+	 * callchains.
+	 *
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u64				nr;
+	 *	u64				ips[nr];
+	 *	struct sample_id		sample_id;
+	 * };
+	 */
+	PERF_RECORD_CALLCHAIN_DEFERRED		= 22,
+
 	PERF_RECORD_MAX,			/* non-ABI */
 };
 
@@ -1258,6 +1274,7 @@ enum perf_callchain_context {
 	PERF_CONTEXT_HV			= (__u64)-32,
 	PERF_CONTEXT_KERNEL		= (__u64)-128,
 	PERF_CONTEXT_USER		= (__u64)-512,
+	PERF_CONTEXT_USER_DEFERRED	= (__u64)-640,
 
 	PERF_CONTEXT_GUEST		= (__u64)-2048,
 	PERF_CONTEXT_GUEST_KERNEL	= (__u64)-2176,
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index ec3a57a5fba1..339f7cbbcf36 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -315,7 +315,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 		max_depth = sysctl_perf_event_max_stack;
 
 	trace = get_perf_callchain(regs, kernel, user, max_depth,
-				   false, false);
+				   false, false, false);
 
 	if (unlikely(!trace))
 		/* couldn't fetch the stack trace */
@@ -452,7 +452,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
 		trace = get_callchain_entry_for_task(task, max_depth);
 	else
 		trace = get_perf_callchain(regs, kernel, user, max_depth,
-					   crosstask, false);
+					   crosstask, false, false);
 
 	if (unlikely(!trace) || trace->nr < skip) {
 		if (may_fault)
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index bba7f8540ade..6645b2a76ba9 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -218,7 +218,7 @@ static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entr
 
 struct perf_callchain_entry *
 get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
-		   u32 max_stack, bool crosstask, bool add_mark)
+		   u32 max_stack, bool crosstask, bool add_mark, bool defer_user)
 {
 	struct perf_callchain_entry *entry;
 	struct perf_callchain_entry_ctx ctx;
@@ -251,6 +251,15 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
 			regs = task_pt_regs(current);
 		}
 
+		if (defer_user) {
+			/*
+			 * Foretell the coming of PERF_RECORD_CALLCHAIN_DEFERRED
+			 * which can be stitched to this one.
+			 */
+			perf_callchain_store_context(&ctx, PERF_CONTEXT_USER_DEFERRED);
+			goto exit_put;
+		}
+
 		if (add_mark)
 			perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 40b98b05fb7c..69dd13c62b1d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5537,6 +5537,31 @@ static bool exclusive_event_installable(struct perf_event *event,
 	return true;
 }
 
+static void perf_pending_unwind_sync(struct perf_event *event)
+{
+	might_sleep();
+
+	if (!event->pending_unwind_callback)
+		return;
+
+	/*
+	 * If the task is queued to the current task's queue, we
+	 * obviously can't wait for it to complete. Simply cancel it.
+	 */
+	if (task_work_cancel(current, &event->pending_unwind_work)) {
+		event->pending_unwind_callback = 0;
+		local_dec(&event->ctx->nr_no_switch_fast);
+		return;
+	}
+
+	/*
+	 * All accesses related to the event are within the same RCU section in
+	 * perf_event_callchain_deferred(). The RCU grace period before the
+	 * event is freed will make sure all those accesses are complete by then.
+	 */
+	rcuwait_wait_event(&event->pending_unwind_wait, !event->pending_unwind_callback, TASK_UNINTERRUPTIBLE);
+}
+
 static void perf_free_addr_filters(struct perf_event *event);
 
 /* vs perf_event_alloc() error */
@@ -5604,6 +5629,7 @@ static void _free_event(struct perf_event *event)
 {
 	irq_work_sync(&event->pending_irq);
 	irq_work_sync(&event->pending_disable_irq);
+	perf_pending_unwind_sync(event);
 
 	unaccount_event(event);
 
@@ -7248,6 +7274,65 @@ static void perf_pending_irq(struct irq_work *entry)
 		perf_swevent_put_recursion_context(rctx);
 }
 
+
+struct perf_callchain_deferred_event {
+	struct perf_event_header	header;
+	u64				nr;
+	u64				ips[];
+};
+
+static void perf_event_callchain_deferred(struct callback_head *work)
+{
+	struct perf_event *event = container_of(work, struct perf_event, pending_unwind_work);
+	struct perf_callchain_deferred_event deferred_event;
+	u64 callchain_context = PERF_CONTEXT_USER;
+	struct unwind_stacktrace trace;
+	struct perf_output_handle handle;
+	struct perf_sample_data data;
+	u64 nr;
+
+	if (!event->pending_unwind_callback)
+		return;
+
+	if (unwind_deferred_trace(&trace) < 0)
+		goto out;
+
+	/*
+	 * All accesses to the event must belong to the same implicit RCU
+	 * read-side critical section as the ->pending_unwind_callback reset.
+	 * See comment in perf_pending_unwind_sync().
+	 */
+	guard(rcu)();
+
+	if (!current->mm)
+		goto out;
+
+	nr = trace.nr + 1 ; /* '+1' == callchain_context */
+
+	deferred_event.header.type = PERF_RECORD_CALLCHAIN_DEFERRED;
+	deferred_event.header.misc = PERF_RECORD_MISC_USER;
+	deferred_event.header.size = sizeof(deferred_event) + (nr * sizeof(u64));
+
+	deferred_event.nr = nr;
+
+	perf_event_header__init_id(&deferred_event.header, &data, event);
+
+	if (perf_output_begin(&handle, &data, event, deferred_event.header.size))
+		goto out;
+
+	perf_output_put(&handle, deferred_event);
+	perf_output_put(&handle, callchain_context);
+	perf_output_copy(&handle, trace.entries, trace.nr * sizeof(u64));
+	perf_event__output_id_sample(event, &handle, &data);
+
+	perf_output_end(&handle);
+
+out:
+	event->pending_unwind_callback = 0;
+	local_dec(&event->ctx->nr_no_switch_fast);
+	rcuwait_wake_up(&event->pending_unwind_wait);
+}
+
 static void perf_pending_task(struct callback_head *head)
 {
 	struct perf_event *event = container_of(head, struct perf_event, pending_task);
@@ -8097,6 +8182,61 @@ static u64 perf_get_page_size(unsigned long addr)
 
 static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
 
+/* Returns the same as deferred_request() below */
+static int deferred_request_nmi(struct perf_event *event)
+{
+	struct callback_head *work = &event->pending_unwind_work;
+	int ret;
+
+	if (event->pending_unwind_callback)
+		return 1;
+
+	ret = task_work_add(current, work, TWA_NMI_CURRENT);
+	if (ret)
+		return ret;
+
+	event->pending_unwind_callback = 1;
+	return 0;
+}
+
+/*
+ * Returns:
+*     > 0 : if already queued.
+ *      0 : if it performed the queuing
+ *    < 0 : if it did not get queued.
+ */
+static int deferred_request(struct perf_event *event)
+{
+	struct callback_head *work = &event->pending_unwind_work;
+	int pending;
+	int ret;
+
+	if (!current->mm || !user_mode(task_pt_regs(current)))
+		return -EINVAL;
+
+	if (in_nmi())
+		return deferred_request_nmi(event);
+
+	guard(irqsave)();
+
+        /* callback already pending? */
+        pending = READ_ONCE(event->pending_unwind_callback);
+        if (pending)
+                return 1;
+
+        /* Claim the work unless an NMI just now swooped in to do so. */
+        if (!try_cmpxchg(&event->pending_unwind_callback, &pending, 1))
+                return 1;
+
+        /* The work has been claimed, now schedule it. */
+        ret = task_work_add(current, work, TWA_RESUME);
+        if (WARN_ON_ONCE(ret)) {
+                WRITE_ONCE(event->pending_unwind_callback, 0);
+                return ret;
+        }
+	return 0;
+}
+
 struct perf_callchain_entry *
 perf_callchain(struct perf_event *event, struct pt_regs *regs)
 {
@@ -8106,12 +8246,27 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
 	bool crosstask = event->ctx->task && event->ctx->task != current;
 	const u32 max_stack = event->attr.sample_max_stack;
 	struct perf_callchain_entry *callchain;
+	bool defer_user = IS_ENABLED(CONFIG_UNWIND_USER) && user &&
+			  event->attr.defer_callchain;
 
 	if (!kernel && !user)
 		return &__empty_callchain;
 
-	callchain = get_perf_callchain(regs, kernel, user,
-				       max_stack, crosstask, true);
+	/* Disallow cross-task callchains. */
+	if (event->ctx->task && event->ctx->task != current)
+		return &__empty_callchain;
+
+	if (defer_user) {
+		int ret = deferred_request(event);
+		if (!ret)
+			local_inc(&event->ctx->nr_no_switch_fast);
+		else if (ret < 0)
+			defer_user = false;
+	}
+
+	callchain = get_perf_callchain(regs, kernel, user, max_stack,
+				       crosstask, true, defer_user);
+
 	return callchain ?: &__empty_callchain;
 }
 
@@ -12943,6 +13098,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	if (err)
 		return ERR_PTR(err);
 
+	if (event->attr.defer_callchain)
+		init_task_work(&event->pending_unwind_work,
+			       perf_event_callchain_deferred);
+
 	/* symmetric to unaccount_event() in _free_event() */
 	account_event(event);
 
diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index 5fc753c23734..65fe495c012e 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -462,7 +462,8 @@ struct perf_event_attr {
 				inherit_thread :  1, /* children only inherit if cloned with CLONE_THREAD */
 				remove_on_exec :  1, /* event is removed from task on exec */
 				sigtrap        :  1, /* send synchronous SIGTRAP on event */
-				__reserved_1   : 26;
+				defer_callchain:  1, /* generate PERF_RECORD_CALLCHAIN_DEFERRED records */
+				__reserved_1   : 25;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
@@ -1228,6 +1229,21 @@ enum perf_event_type {
 	 */
 	PERF_RECORD_AUX_OUTPUT_HW_ID		= 21,
 
+	/*
+	 * This user callchain capture was deferred until shortly before
+	 * returning to user space.  Previous samples would have kernel
+	 * callchains only and they need to be stitched with this to make full
+	 * callchains.
+	 *
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u64				nr;
+	 *	u64				ips[nr];
+	 *	struct sample_id		sample_id;
+	 * };
+	 */
+	PERF_RECORD_CALLCHAIN_DEFERRED		= 22,
+
 	PERF_RECORD_MAX,			/* non-ABI */
 };
 
@@ -1258,6 +1274,7 @@ enum perf_callchain_context {
 	PERF_CONTEXT_HV			= (__u64)-32,
 	PERF_CONTEXT_KERNEL		= (__u64)-128,
 	PERF_CONTEXT_USER		= (__u64)-512,
+	PERF_CONTEXT_USER_DEFERRED	= (__u64)-640,
 
 	PERF_CONTEXT_GUEST		= (__u64)-2048,
 	PERF_CONTEXT_GUEST_KERNEL	= (__u64)-2176,
-- 
2.47.2
Re: [PATCH v5 13/17] perf: Support deferred user callchains
Posted by Steven Rostedt 7 months, 2 weeks ago
On Thu, 24 Apr 2025 12:25:42 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> +static void perf_event_callchain_deferred(struct callback_head *work)
> +{
> +	struct perf_event *event = container_of(work, struct perf_event, pending_unwind_work);
> +	struct perf_callchain_deferred_event deferred_event;
> +	u64 callchain_context = PERF_CONTEXT_USER;
> +	struct unwind_stacktrace trace;
> +	struct perf_output_handle handle;
> +	struct perf_sample_data data;
> +	u64 nr;
> +
> +	if (!event->pending_unwind_callback)
> +		return;
> +
> +	if (unwind_deferred_trace(&trace) < 0)
> +		goto out;
> +
> +	/*
> +	 * All accesses to the event must belong to the same implicit RCU
> +	 * read-side critical section as the ->pending_unwind_callback reset.
> +	 * See comment in perf_pending_unwind_sync().
> +	 */
> +	guard(rcu)();
> +
> +	if (!current->mm)
> +		goto out;
> +
> +	nr = trace.nr + 1 ; /* '+1' == callchain_context */

Hi Namhyung,

Talking with Beau about how Microsoft does their own deferred tracing, I
wonder if the timestamp approach would be useful.

This is where a timestamp is taken at the first request for a deferred
trace, and this is recorded in the trace when it happens. It basically
states that "this trace is good up until the given timestamp".

The rationale for this is for lost events. Let's say you have:

  <task enters kernel>
    Request deferred trace

    <buffer fills up and events start to get lost>

    Deferred trace happens (but is dropped due to buffer being full)

  <task exits kernel>

  <task enters kernel again>
    Request deferred trace  (Still dropped due to buffer being full)

    <Reader catches up and buffer is free again>

    Deferred trace happens (this time it is recorded)
  <task exits kernel>

How would user space know that the deferred trace that was recorded doesn't
go with the request (and kernel stack trace) that was done initially?

If we add a timestamp, then it would look like:

  <task enters kernel>
    Request deferred trace
    [Record timestamp]

    <buffer fills up and events start to get lost>

    Deferred trace happens with timestamp (but is dropped due to buffer being full)

  <task exits kernel>

  <task enters kernel again>
    Request deferred trace  (Still dropped due to buffer being full)
    [Record timestamp]

    <Reader catches up and buffer is free again>

    Deferred trace happens with timestamp (this time it is recorded)
  <task exits kernel>

Then user space will look at the timestamp that was recorded and know that
it's not for the initial request, because the timestamp of the kernel stack
trace was before the timestamp of the user space stacktrace, and therefore
the user stacktrace is not valid for that kernel stacktrace.

The timestamp would become zero when exiting to user space. The first
request will add it but would need a cmpxchg to do so, and if the cmpxchg
fails, it then needs to check if the one recorded is before the current
one; if it isn't, it still needs to update the timestamp (this is to
handle races with NMIs).

Basically, the timestamp would replace the cookie method.
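
Roughly what I'm picturing for the request side (just a sketch; the struct,
field and function names are placeholders, not what would actually land):

	static void perf_defer_stamp(struct unwind_task_info *info)
	{
		u64 now = local_clock();
		u64 old = 0;

		/* First request since entering the kernel claims the slot. */
		if (try_cmpxchg64(&info->defer_timestamp, &old, now))
			return;

		/*
		 * Lost the race, likely to an NMI. If the recorded stamp is
		 * later than ours, ours was the earlier request, so keep
		 * retrying so that the earliest timestamp wins.
		 */
		while (old > now && !try_cmpxchg64(&info->defer_timestamp, &old, now))
			;
	}

And the exit-to-user path would simply write the slot back to zero.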

Thoughts?

-- Steve


> +
> +	deferred_event.header.type = PERF_RECORD_CALLCHAIN_DEFERRED;
> +	deferred_event.header.misc = PERF_RECORD_MISC_USER;
> +	deferred_event.header.size = sizeof(deferred_event) + (nr * sizeof(u64));
> +
> +	deferred_event.nr = nr;
> +
> +	perf_event_header__init_id(&deferred_event.header, &data, event);
> +
> +	if (perf_output_begin(&handle, &data, event, deferred_event.header.size))
> +		goto out;
> +
> +	perf_output_put(&handle, deferred_event);
> +	perf_output_put(&handle, callchain_context);
> +	perf_output_copy(&handle, trace.entries, trace.nr * sizeof(u64));
> +	perf_event__output_id_sample(event, &handle, &data);
> +
> +	perf_output_end(&handle);
> +
> +out:
> +	event->pending_unwind_callback = 0;
> +	local_dec(&event->ctx->nr_no_switch_fast);
> +	rcuwait_wake_up(&event->pending_unwind_wait);
> +}
> +
Re: [PATCH v5 13/17] perf: Support deferred user callchains
Posted by Mathieu Desnoyers 7 months, 2 weeks ago
On 2025-05-08 12:03, Steven Rostedt wrote:
> On Thu, 24 Apr 2025 12:25:42 -0400
> Steven Rostedt <rostedt@goodmis.org> wrote:
> 
>> +static void perf_event_callchain_deferred(struct callback_head *work)
>> +{
>> +	struct perf_event *event = container_of(work, struct perf_event, pending_unwind_work);
>> +	struct perf_callchain_deferred_event deferred_event;
>> +	u64 callchain_context = PERF_CONTEXT_USER;
>> +	struct unwind_stacktrace trace;
>> +	struct perf_output_handle handle;
>> +	struct perf_sample_data data;
>> +	u64 nr;
>> +
>> +	if (!event->pending_unwind_callback)
>> +		return;
>> +
>> +	if (unwind_deferred_trace(&trace) < 0)
>> +		goto out;
>> +
>> +	/*
>> +	 * All accesses to the event must belong to the same implicit RCU
>> +	 * read-side critical section as the ->pending_unwind_callback reset.
>> +	 * See comment in perf_pending_unwind_sync().
>> +	 */
>> +	guard(rcu)();
>> +
>> +	if (!current->mm)
>> +		goto out;
>> +
>> +	nr = trace.nr + 1 ; /* '+1' == callchain_context */
> 
> Hi Namhyung,
> 
> Talking with Beau about how Microsoft does their own deferred tracing, I
> wonder if the timestamp approach would be useful.
> 
> This is where a timestamp is taken at the first request for a deferred
> trace, and this is recorded in the trace when it happens. It basically
> states that "this trace is good up until the given timestamp".
> 
> The rationale for this is for lost events. Let's say you have:
> 
>    <task enters kernel>
>      Request deferred trace
> 
>      <buffer fills up and events start to get lost>
> 
>      Deferred trace happens (but is dropped due to buffer being full)
> 
>    <task exits kernel>
> 
>    <task enters kernel again>
>      Request deferred trace  (Still dropped due to buffer being full)
> 
>      <Reader catches up and buffer is free again>
> 
>      Deferred trace happens (this time it is recorded>
>    <task exits kernel>
> 
> How would user space know that the deferred trace that was recorded doesn't
> go with the request (and kernel stack trace) that was done initially)?
> 
> If we add a timestamp, then it would look like:
> 
>    <task enters kernel>
>      Request deferred trace
>      [Record timestamp]
> 
>      <buffer fills up and events start to get lost>
> 
>      Deferred trace happens with timestamp (but is dropped due to buffer being full)
> 
>    <task exits kernel>
> 
>    <task enters kernel again>
>      Request deferred trace  (Still dropped due to buffer being full)
>      [Record timestamp]
> 
>      <Reader catches up and buffer is free again>
> 
>      Deferred trace happens with timestamp (this time it is recorded>
>    <task exits kernel>
> 
> Then user space will look at the timestamp that was recorded and know that
> it's not for the initial request because the timestamp of the kernel stack
> trace done was before the timestamp of the user space stacktrace and
> therefore is not valid for the kernel stacktrace.
> 
> The timestamp would become zero when exiting to user space. The first
> request will add it but would need a cmpxchg to do so, and if the cmpxchg
> fails, it then needs to check if the one recorded is before the current
> one, and if it isn't it still needs to update the timestamp (this is to
> handle races with NMIs).
> 
> Basically, the timestamp would replace the cookie method.
> 
> Thoughts?

AFAIR, the cookie method generates the cookie by combining the cpu
number with a per-cpu count.

This ensures that there are not two cookies emitted at the same time
from two CPUs that have the same value by accident.

How would the timestamp method prevent this ?

Thanks,

Mathieu

> 
> -- Steve
> 
> 
>> +
>> +	deferred_event.header.type = PERF_RECORD_CALLCHAIN_DEFERRED;
>> +	deferred_event.header.misc = PERF_RECORD_MISC_USER;
>> +	deferred_event.header.size = sizeof(deferred_event) + (nr * sizeof(u64));
>> +
>> +	deferred_event.nr = nr;
>> +
>> +	perf_event_header__init_id(&deferred_event.header, &data, event);
>> +
>> +	if (perf_output_begin(&handle, &data, event, deferred_event.header.size))
>> +		goto out;
>> +
>> +	perf_output_put(&handle, deferred_event);
>> +	perf_output_put(&handle, callchain_context);
>> +	perf_output_copy(&handle, trace.entries, trace.nr * sizeof(u64));
>> +	perf_event__output_id_sample(event, &handle, &data);
>> +
>> +	perf_output_end(&handle);
>> +
>> +out:
>> +	event->pending_unwind_callback = 0;
>> +	local_dec(&event->ctx->nr_no_switch_fast);
>> +	rcuwait_wake_up(&event->pending_unwind_wait);
>> +}
>> +


-- 
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
Re: [PATCH v5 13/17] perf: Support deferred user callchains
Posted by Steven Rostedt 7 months, 2 weeks ago
On Thu, 8 May 2025 14:49:59 -0400
Mathieu Desnoyers <mathieu.desnoyers@efficios.com> wrote:

> AFAIR, the cookie method generates the cookie by combining the cpu
> number with a per-cpu count.
> 
> This ensures that there are not two cookies emitted at the same time
> from two CPUs that have the same value by accident.
> 
> How would the timestamp method prevent this ?

Do we care? It only needs to be unique per pid, doesn't it?

-- Steve
Re: [PATCH v5 13/17] perf: Support deferred user callchains
Posted by Mathieu Desnoyers 7 months, 1 week ago
On 2025-05-08 14:54, Steven Rostedt wrote:
> On Thu, 8 May 2025 14:49:59 -0400
> Mathieu Desnoyers <mathieu.desnoyers@efficios.com> wrote:
> 
>> AFAIR, the cookie method generates the cookie by combining the cpu
>> number with a per-cpu count.
>>
>> This ensures that there are not two cookies emitted at the same time
>> from two CPUs that have the same value by accident.
>>
>> How would the timestamp method prevent this ?
> 
> Do we care? It only needs to be unique per pid doesn't it?

Is it possible to have many threads writing into the same
ring buffer in that scenario?  Are all event records stamped
with their associated PID?  As long as we have enough information
to know which thread was associated with the timestamp cookie
on both ends (request for callchain and saving the user callchain
on return to userspace), we should be OK.

Thanks,

Mathieu

-- 
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
Re: [PATCH v5 13/17] perf: Support deferred user callchains
Posted by Namhyung Kim 7 months, 1 week ago
Hi Mathieu,

On Fri, May 09, 2025 at 08:23:49AM -0400, Mathieu Desnoyers wrote:
> On 2025-05-08 14:54, Steven Rostedt wrote:
> > On Thu, 8 May 2025 14:49:59 -0400
> > Mathieu Desnoyers <mathieu.desnoyers@efficios.com> wrote:
> > 
> > > AFAIR, the cookie method generates the cookie by combining the cpu
> > > number with a per-cpu count.
> > > 
> > > This ensures that there are not two cookies emitted at the same time
> > > from two CPUs that have the same value by accident.
> > > 
> > > How would the timestamp method prevent this ?
> > 
> > Do we care? It only needs to be unique per pid doesn't it?
> 
> Is it possible to have many threads writing into the same
> ring buffer in that scenario ? Are all event records stamped
> with their associated PID ? As long as we have enough information
> to know which thread was associated with the timestamp cookie
> on both ends (request for callchain and saving the user callchain
> on return to userspace), we should be OK.

Yep, basically perf sets PERF_SAMPLE_TID (and sample_id_all), which makes
every record come with PID/TID.

Thanks,
Namhyung
Re: [PATCH v5 13/17] perf: Support deferred user callchains
Posted by Steven Rostedt 7 months, 1 week ago
On Fri, 9 May 2025 08:45:46 -0700
Namhyung Kim <namhyung@kernel.org> wrote:

> > Is it possible to have many threads writing into the same
> > ring buffer in that scenario ? Are all event records stamped
> > with their associated PID ? As long as we have enough information
> > to know which thread was associated with the timestamp cookie
> > on both ends (request for callchain and saving the user callchain
> > on return to userspace), we should be OK.  
> 
> Yep, basically perf sets PERF_SAMPLE_TID (and sample_id_all) which makes
> every records come with PID/TID.. 

If LTTng is fine with this too, then we should be OK as perf and ftrace
both record TID in the events. I'm assuming that BPF would be OK too.

-- Steve
Re: [PATCH v5 13/17] perf: Support deferred user callchains
Posted by Namhyung Kim 7 months, 2 weeks ago
Hi Steve,

On Thu, May 08, 2025 at 12:03:21PM -0400, Steven Rostedt wrote:
> On Thu, 24 Apr 2025 12:25:42 -0400
> Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> > +static void perf_event_callchain_deferred(struct callback_head *work)
> > +{
> > +	struct perf_event *event = container_of(work, struct perf_event, pending_unwind_work);
> > +	struct perf_callchain_deferred_event deferred_event;
> > +	u64 callchain_context = PERF_CONTEXT_USER;
> > +	struct unwind_stacktrace trace;
> > +	struct perf_output_handle handle;
> > +	struct perf_sample_data data;
> > +	u64 nr;
> > +
> > +	if (!event->pending_unwind_callback)
> > +		return;
> > +
> > +	if (unwind_deferred_trace(&trace) < 0)
> > +		goto out;
> > +
> > +	/*
> > +	 * All accesses to the event must belong to the same implicit RCU
> > +	 * read-side critical section as the ->pending_unwind_callback reset.
> > +	 * See comment in perf_pending_unwind_sync().
> > +	 */
> > +	guard(rcu)();
> > +
> > +	if (!current->mm)
> > +		goto out;
> > +
> > +	nr = trace.nr + 1 ; /* '+1' == callchain_context */
> 
> Hi Namhyung,
> 
> Talking with Beau about how Microsoft does their own deferred tracing, I
> wonder if the timestamp approach would be useful.
> 
> This is where a timestamp is taken at the first request for a deferred
> trace, and this is recorded in the trace when it happens. It basically
> states that "this trace is good up until the given timestamp".
> 
> The rationale for this is for lost events. Let's say you have:
> 
>   <task enters kernel>
>     Request deferred trace
> 
>     <buffer fills up and events start to get lost>
> 
>     Deferred trace happens (but is dropped due to buffer being full)
> 
>   <task exits kernel>
> 
>   <task enters kernel again>
>     Request deferred trace  (Still dropped due to buffer being full)
> 
>     <Reader catches up and buffer is free again>
> 
>     Deferred trace happens (this time it is recorded>
>   <task exits kernel>
> 
> How would user space know that the deferred trace that was recorded doesn't
> go with the request (and kernel stack trace) that was done initially)?

Right, this is a problem.

> 
> If we add a timestamp, then it would look like:
> 
>   <task enters kernel>
>     Request deferred trace
>     [Record timestamp]
> 
>     <buffer fills up and events start to get lost>
> 
>     Deferred trace happens with timestamp (but is dropped due to buffer being full)
> 
>   <task exits kernel>
> 
>   <task enters kernel again>
>     Request deferred trace  (Still dropped due to buffer being full)
>     [Record timestamp]
> 
>     <Reader catches up and buffer is free again>
> 
>     Deferred trace happens with timestamp (this time it is recorded>
>   <task exits kernel>
> 
> Then user space will look at the timestamp that was recorded and know that
> it's not for the initial request because the timestamp of the kernel stack
> trace done was before the timestamp of the user space stacktrace and
> therefore is not valid for the kernel stacktrace.

IIUC the deferred stacktrace will have the timestamp of the first
request, right?

> 
> The timestamp would become zero when exiting to user space. The first
> request will add it but would need a cmpxchg to do so, and if the cmpxchg
> fails, it then needs to check if the one recorded is before the current
> one, and if it isn't it still needs to update the timestamp (this is to
> handle races with NMIs).

Yep, it needs to maintain an accurate first timestamp.

> 
> Basically, the timestamp would replace the cookie method.
> 
> Thoughts?

Sounds good to me.  You'll need to add it to the
PERF_RECORD_CALLCHAIN_DEFERRED record.  Probably it should check if
sample_type has PERF_SAMPLE_TIME.  It'd work along with PERF_SAMPLE_TID
(which will be added by the perf tools anyway).
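
Maybe the record would end up looking like this (just a sketch using the
header-comment layout convention; the field name is made up):

	struct {
		struct perf_event_header	header;
		u64				defer_timestamp; /* time of the first request */
		u64				nr;
		u64				ips[nr];
		struct sample_id		sample_id;
	};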
 
Thanks,
Namhyung

> 
> > +
> > +	deferred_event.header.type = PERF_RECORD_CALLCHAIN_DEFERRED;
> > +	deferred_event.header.misc = PERF_RECORD_MISC_USER;
> > +	deferred_event.header.size = sizeof(deferred_event) + (nr * sizeof(u64));
> > +
> > +	deferred_event.nr = nr;
> > +
> > +	perf_event_header__init_id(&deferred_event.header, &data, event);
> > +
> > +	if (perf_output_begin(&handle, &data, event, deferred_event.header.size))
> > +		goto out;
> > +
> > +	perf_output_put(&handle, deferred_event);
> > +	perf_output_put(&handle, callchain_context);
> > +	perf_output_copy(&handle, trace.entries, trace.nr * sizeof(u64));
> > +	perf_event__output_id_sample(event, &handle, &data);
> > +
> > +	perf_output_end(&handle);
> > +
> > +out:
> > +	event->pending_unwind_callback = 0;
> > +	local_dec(&event->ctx->nr_no_switch_fast);
> > +	rcuwait_wake_up(&event->pending_unwind_wait);
> > +}
> > +
Re: [PATCH v5 13/17] perf: Support deferred user callchains
Posted by Namhyung Kim 7 months, 3 weeks ago
Hello,

On Thu, Apr 24, 2025 at 12:25:42PM -0400, Steven Rostedt wrote:
> From: Josh Poimboeuf <jpoimboe@kernel.org>
> 
> Use the new unwind_deferred_trace() interface (if available) to defer
> unwinds to task context.  This will allow the use of .sframe (when it
> becomes available) and also prevents duplicate userspace unwinds.
> 
> Suggested-by: Peter Zijlstra <peterz@infradead.org>
> Co-developed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> ---
[SNIP]
> +/*
> + * Returns:
> +*     > 0 : if already queued.
> + *      0 : if it performed the queuing
> + *    < 0 : if it did not get queued.
> + */
> +static int deferred_request(struct perf_event *event)
> +{
> +	struct callback_head *work = &event->pending_unwind_work;
> +	int pending;
> +	int ret;

I'm not sure if it works for per-CPU events.  The event is shared so any
task can request the deferred callchains.  Does it handle the case where
task A requests one and is scheduled out before going to user mode, and
task B on the CPU also requests another after that?  I'm afraid not.

> +
> +	if (!current->mm || !user_mode(task_pt_regs(current)))
> +		return -EINVAL;

Does it mean it cannot use a deferred callstack when it's in kernel
mode, like during a syscall?

Thanks,
Namhyung

> +
> +	if (in_nmi())
> +		return deferred_request_nmi(event);
> +
> +	guard(irqsave)();
> +
> +        /* callback already pending? */
> +        pending = READ_ONCE(event->pending_unwind_callback);
> +        if (pending)
> +                return 1;
> +
> +        /* Claim the work unless an NMI just now swooped in to do so. */
> +        if (!try_cmpxchg(&event->pending_unwind_callback, &pending, 1))
> +                return 1;
> +
> +        /* The work has been claimed, now schedule it. */
> +        ret = task_work_add(current, work, TWA_RESUME);
> +        if (WARN_ON_ONCE(ret)) {
> +                WRITE_ONCE(event->pending_unwind_callback, 0);
> +                return ret;
> +        }
> +	return 0;
> +}
Re: [PATCH v5 13/17] perf: Support deferred user callchains
Posted by Steven Rostedt 7 months, 3 weeks ago
On Fri, 25 Apr 2025 08:24:47 -0700
Namhyung Kim <namhyung@kernel.org> wrote:
> > +/*
> > + * Returns:
> > +*     > 0 : if already queued.
> > + *      0 : if it performed the queuing
> > + *    < 0 : if it did not get queued.
> > + */
> > +static int deferred_request(struct perf_event *event)
> > +{
> > +	struct callback_head *work = &event->pending_unwind_work;
> > +	int pending;
> > +	int ret;  
> 
> I'm not sure if it works for per-CPU events.  The event is shared so any
> task can request the deferred callchains.  Does it handle if task A
> requests one and scheduled out before going to the user mode, and task B
> on the CPU also requests another after that?  I'm afraid not..

I was afraid of that.

This is no different than what Josh did in his last set in v4. I'm guessing
the issue is running with "-a", correct?

Could we just not use deferred when running with "-a" for now? Or could we
possibly just make the deferred stacktrace its own event? Have it be
possible that perf just registers a single instance with the deferred
unwinding logic, and then perf can handle where to write the information. I
don't know perf well enough to implement that.

Josh's code had it call unwind_deferred_init() and just used its own
event to call back to, and that was called on hundreds of events when I ran:

  perf record -g <whatever>

Same if I added the "-a" option.

The above function return values came from Josh's code:

  https://git.kernel.org/pub/scm/linux/kernel/git/jpoimboe/linux.git/tree/kernel/unwind/deferred.c?h=sframe#n173

I just moved it out of deferred.c and into perf itself, and removed the
cookie logic.

> 
> > +
> > +	if (!current->mm || !user_mode(task_pt_regs(current)))
> > +		return -EINVAL;  
> 
> Does it mean it cannot use deferred callstack when it's in the kernel
> mode like during a syscall?

task_pt_regs(current) will return the regs from when the task entered the
kernel. So the answer is no, it will still trace if an interrupt happened
while a task is in a system call.

-- Steve
Re: [PATCH v5 13/17] perf: Support deferred user callchains
Posted by Namhyung Kim 7 months, 3 weeks ago
On Fri, Apr 25, 2025 at 12:58:15PM -0400, Steven Rostedt wrote:
> On Fri, 25 Apr 2025 08:24:47 -0700
> Namhyung Kim <namhyung@kernel.org> wrote:
> > > +/*
> > > + * Returns:
> > > +*     > 0 : if already queued.
> > > + *      0 : if it performed the queuing
> > > + *    < 0 : if it did not get queued.
> > > + */
> > > +static int deferred_request(struct perf_event *event)
> > > +{
> > > +	struct callback_head *work = &event->pending_unwind_work;
> > > +	int pending;
> > > +	int ret;  
> > 
> > I'm not sure if it works for per-CPU events.  The event is shared so any
> > task can request the deferred callchains.  Does it handle if task A
> > requests one and scheduled out before going to the user mode, and task B
> > on the CPU also requests another after that?  I'm afraid not..
> 
> I was afraid of that.
> 
> This is no different that what Josh did in his last set in v4. I'm guessing
> the issue is running with "-a", correct?

Yes.

> 
> Could we just not use deferred when running with "-a" for now? Or could we
> possibly just make the deferred stacktrace its own event? Have it be
> possible that perf just registers a signal instance with the deferred
> unwinding logic, and then perf can handle where to write the information. I
> don't know perf well enough to implement that.

Even if it excludes per-CPU events, per-task events can also attach to a
CPU, and that's the default behavior of perf record IIRC.  In that
case, it needs to be careful when it accesses the event since the task
can migrate to another CPU.  So I'm not sure if it's a good idea to
track the event that requested the deferred callchains.

Also it doesn't need to emit duplicate deferred callchains if a task
has multiple events and they are requesting callchains.  Unfortunately,
the kernel cannot know which events are related or profiled together.

Hmm.. maybe we can add a cookie to the event itself (by ioctl or
something) in order to group events in a profiling session and then use
that for deferred callchains?  The task should maintain a list of active
cookies (or sessions) somehow, but then perf can check if the current CPU
has events with matching cookies and emit a deferred callchain.
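
Something like this, conceptually (all names here are made up, just to
illustrate the idea):

	struct perf_defer_cookie {
		u64			cookie;	/* set on the event via a new ioctl */
		struct list_head	list;	/* on current->defer_cookies */
	};

	static void perf_emit_deferred_callchains(struct unwind_stacktrace *trace)
	{
		struct perf_defer_cookie *c;
		struct perf_event *event;

		/* One deferred record per session, not per event. */
		list_for_each_entry(c, &current->defer_cookies, list) {
			event = perf_find_event_by_cookie(c->cookie);
			if (event)
				perf_output_deferred_callchain(event, trace);
		}
	}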

> 
> Josh's code had it call the unwind_deferred_init() and just used its own
> event to callback to and that was called on hundreds of events when I ran:
> 
>   perf record -g <whatever>
> 
> Same if I added the "-a" option.
> 
> The above function return values came from Josh's code:
> 
>   https://git.kernel.org/pub/scm/linux/kernel/git/jpoimboe/linux.git/tree/kernel/unwind/deferred.c?h=sframe#n173
> 
> I just moved it out of deferred.c and into perf itself, and removed the
> cookie logic.
> 
> > 
> > > +
> > > +	if (!current->mm || !user_mode(task_pt_regs(current)))
> > > +		return -EINVAL;  
> > 
> > Does it mean it cannot use deferred callstack when it's in the kernel
> > mode like during a syscall?
> 
> task_pt_regs(current) will return the regs from when the task entered the
> kernel. So the answer is no, it will still trace if an interrupt happened
> while a task is in a system call.

Ok, thanks for the explanation.  Then the user_mode check was for kernel
threads, right?

Thanks,
Namhyung
Re: [PATCH v5 13/17] perf: Support deferred user callchains
Posted by Steven Rostedt 7 months, 3 weeks ago
On Mon, 28 Apr 2025 13:42:15 -0700
Namhyung Kim <namhyung@kernel.org> wrote:


> > 
> > Could we just not use deferred when running with "-a" for now? Or could we
> > possibly just make the deferred stacktrace its own event? Have it be
> > possible that perf just registers a signal instance with the deferred
> > unwinding logic, and then perf can handle where to write the information. I
> > don't know perf well enough to implement that.  
> 
> Even if it excludes per-CPU events, per-task events also can attach to a
> CPU and that's the default behavior of the perf record IIRC.  In that
> case, it needs to be careful when it accesses the event since the task
> can migrate to another CPU.  So I'm not sure if it's a good idea to
> track event that requested the deferred callchains.

Wait? Even for per-task, it uses per-CPU events?

> 
> Also it doesn't need to emit duplicate deferred callchains if a task
> has multiple events and they are requesting callchains.  Unfortunately,
> the kernel cannot know which events are related or profiled together.
> 
> Hmm.. maybe we can add a cookie to the event itself (by ioctl or
> something) in order to group events in a profiling session and then use
> that for deferred callchains?  Task should maintain a list of active
> cookies (or sessions) somehow but then perf can check if the current CPU
> has events with matching cookies and emit a deferred callchain.

Could we add a callchain event? It gets woken at any request but not
triggered until the task returns. A way that there would be only a single
event for every perf instance, but it can trace any task.

It could use the cookie method that ftrace uses, where the request gets a
cookie, and can be recorded to the perf event in the interrupt. Then the
callchain would record the cookie along with the stack trace, and then perf
tool could just match up the kernel stacks with their cookies to the user
stack with its cookie.
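
On the tool side I'd imagine something like this (struct and helper names
are made up; it assumes the sample's ips[] array has room):

	static void stitch_callchain(struct stitched_sample *sample,
				     struct deferred_record *dr)
	{
		/* Only stitch records from the same task and request. */
		if (sample->tid != dr->tid || sample->cookie != dr->cookie)
			return;

		/* Drop the PERF_CONTEXT_USER_DEFERRED marker ... */
		sample->nr--;
		/* ... and append the user frames from the deferred record. */
		memcpy(&sample->ips[sample->nr], dr->ips, dr->nr * sizeof(u64));
		sample->nr += dr->nr;
	}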

-- Steve
Re: [PATCH v5 13/17] perf: Support deferred user callchains
Posted by Namhyung Kim 7 months, 3 weeks ago
On Mon, Apr 28, 2025 at 06:02:53PM -0400, Steven Rostedt wrote:
> On Mon, 28 Apr 2025 13:42:15 -0700
> Namhyung Kim <namhyung@kernel.org> wrote:
> 
> 
> > > 
> > > Could we just not use deferred when running with "-a" for now? Or could we
> > > possibly just make the deferred stacktrace its own event? Have it be
> > > possible that perf just registers a signal instance with the deferred
> > > unwinding logic, and then perf can handle where to write the information. I
> > > don't know perf well enough to implement that.  
> > 
> > Even if it excludes per-CPU events, per-task events also can attach to a
> > CPU and that's the default behavior of the perf record IIRC.  In that
> > case, it needs to be careful when it accesses the event since the task
> > can migrate to another CPU.  So I'm not sure if it's a good idea to
> > track event that requested the deferred callchains.
> 
> Wait? Even for per task, it uses per cpu?

Yep, it has per-task + per-CPU events.

> 
> > 
> > Also it doesn't need to emit duplicate deferred callchains if a task
> > has multiple events and they are requesting callchains.  Unfortunately,
> > the kernel cannot know which events are related or profiled together.
> > 
> > Hmm.. maybe we can add a cookie to the event itself (by ioctl or
> > something) in order to group events in a profiling session and then use
> > that for deferred callchains?  Task should maintain a list of active
> > cookies (or sessions) somehow but then perf can check if the current CPU
> > has events with matching cookies and emit a deferred callchain.
> 
> Could we add a callchain event? It gets woken at any request but not
> triggered until the task returns. A way that there would be only a single
> event for every perf instance, but it can trace any task.

Thing is that the kernel doesn't know the relationship between events.
For example, if I run this command on a machine with 100 CPUs:

  $ perf record -e cycles,instructions -- $MYPROG

it would open 200 events and they don't know each other.  Later, another
process can start a new perf profiling session for the same task.  IIUC
there's no way to identify which ones are related in the kernel.

So I think we need a way to share some information for those 200 events
and then emit deferred callchain records with the shared info.

> 
> It could use the cookie method that ftrace uses, where the request gets a
> cookie, and can be recorded to the perf event in the interrupt. Then the
> callchain would record the cookie along with the stack trace, and then perf
> tool could just match up the kernel stacks with their cookies to the user
> stack with its cookie.

Yep, but the kernel should know to which events (or ring buffer) it should
emit the deferred callchains.  I don't think it needs to include the
cookie in the perf data, but it can be used to find which event or ring
buffer for the session is related to this request.

Thanks,
Namhyung
Re: [PATCH v5 13/17] perf: Support deferred user callchains
Posted by Steven Rostedt 7 months, 3 weeks ago
On Mon, 28 Apr 2025 17:29:53 -0700
Namhyung Kim <namhyung@kernel.org> wrote:

> Thing is that the kernel doesn't know the relationship between events.
> For example, if I run this command on a machine with 100 CPUs:
> 
>   $ perf record -e cycles,instructions -- $MYPROG
> 
> it would open 200 events and they don't know each other.  Later other
> process can start a new perf profiling for the same task.  IIUC there's
> no way to identify which one is related in the kernel.
> 
> So I think we need a way to share some informaiton for those 200 events
> and then emits deferred callchain records with the shared info.

Hmm, I'm thinking of creating an internal perf descriptor that would join
events by who created them. That is, the first event created will take the
thread leader (pid of the task) and check if an entity exists for it. If
one doesn't exist, it will create it and add itself to that entity if it has
a deferred trace attribute set. If it already exists, it will just add
itself to it. This deferred descriptor will register itself with the
deferred unwinder like ftrace does (one per process), and then use it to
defer callbacks. When the callback happens, it will look for the thread
event or CPU event that matches the current thread or current CPU and
record the backtrace there.
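
Roughly this shape (all of these names are invented, and I'm assuming the
deferred unwinder's callback looks more or less like it does in Josh's tree):

	struct perf_unwind_desc {
		struct unwind_work	work;	/* registered once with the deferred unwinder */
		pid_t			tgid;	/* thread group leader of the creating process */
		struct list_head	events;	/* all defer-enabled events it created */
	};

	static void perf_unwind_callback(struct unwind_work *work,
					 struct unwind_stacktrace *trace, u64 cookie)
	{
		struct perf_unwind_desc *desc = container_of(work, struct perf_unwind_desc, work);
		struct perf_event *event;

		/*
		 * Find the event on this descriptor that is bound to the
		 * current task or to the current CPU, and write the user
		 * stacktrace into its ring buffer.
		 */
		event = perf_unwind_find_event(desc, current);
		if (event)
			perf_output_deferred_callchain(event, trace, cookie);
	}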


> 
> > 
> > It could use the cookie method that ftrace uses, where the request gets a
> > cookie, and can be recorded to the perf event in the interrupt. Then the
> > callchain would record the cookie along with the stack trace, and then perf
> > tool could just match up the kernel stacks with their cookies to the user
> > stack with its cookie.  
> 
> Yep, but the kernel should know which events (or ring buffer) it should
> emit the deferred callchains.  I don't think it needs to include the
> cookie in the perf data, but it can be used to find which event or ring
> buffer for the session is related to this request.

Let me see if my suggestion would work or not. I'll try it out and see what
happens. And post patches later.

-- Steve
Re: [PATCH v5 13/17] perf: Support deferred user callchains
Posted by Peter Zijlstra 7 months, 4 weeks ago
On Thu, Apr 24, 2025 at 12:25:42PM -0400, Steven Rostedt wrote:
> +static int deferred_request(struct perf_event *event)
> +{
> +	struct callback_head *work = &event->pending_unwind_work;
> +	int pending;
> +	int ret;
> +
> +	if (!current->mm || !user_mode(task_pt_regs(current)))
> +		return -EINVAL;
> +
> +	if (in_nmi())
> +		return deferred_request_nmi(event);
> +
> +	guard(irqsave)();
> +
> +        /* callback already pending? */
> +        pending = READ_ONCE(event->pending_unwind_callback);
> +        if (pending)
> +                return 1;
> +
> +        /* Claim the work unless an NMI just now swooped in to do so. */
> +        if (!try_cmpxchg(&event->pending_unwind_callback, &pending, 1))
> +                return 1;
> +
> +        /* The work has been claimed, now schedule it. */
> +        ret = task_work_add(current, work, TWA_RESUME);
> +        if (WARN_ON_ONCE(ret)) {
> +                WRITE_ONCE(event->pending_unwind_callback, 0);
> +                return ret;
> +        }
> +	return 0;
> +}

You have whitespace issues ...
Re: [PATCH v5 13/17] perf: Support deferred user callchains
Posted by Steven Rostedt 7 months, 4 weeks ago
On Thu, 24 Apr 2025 18:38:45 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> > +	guard(irqsave)();
> > +
> > +        /* callback already pending? */
> > +        pending = READ_ONCE(event->pending_unwind_callback);
> > +        if (pending)
> > +                return 1;
> > +
> > +        /* Claim the work unless an NMI just now swooped in to do so. */
> > +        if (!try_cmpxchg(&event->pending_unwind_callback, &pending, 1))
> > +                return 1;
> > +
> > +        /* The work has been claimed, now schedule it. */
> > +        ret = task_work_add(current, work, TWA_RESUME);
> > +        if (WARN_ON_ONCE(ret)) {
> > +                WRITE_ONCE(event->pending_unwind_callback, 0);
> > +                return ret;
> > +        }
> > +	return 0;
> > +}  
> 
> You have whitespace issues ...

Bah, that may have been due to cut and paste to fix up things during the
multiple rebases I did.

Thanks!

-- Steve