[PATCH v10 06/14] unwind_user/deferred: Add deferred unwinding interface

Posted by Steven Rostedt 4 months ago
From: Josh Poimboeuf <jpoimboe@kernel.org>

Add an interface for scheduling task work to unwind the user space stack
before returning to user space. This solves several problems for its
callers:

  - Ensure the unwind happens in task context even if the caller may be
    running in interrupt context.

  - Avoid duplicate unwinds, whether called multiple times by the same
    caller or by different callers.

  - Take a timestamp when the first request comes in after the task
    entered the kernel. This will be returned to the calling function
    along with the stack trace when the task leaves the kernel. This
    timestamp can be used to correlate kernel unwinds/traces with the user
    unwind.

The timestamp identifies requests that map to the same stacktrace. It is
generated the first time a user space stacktrace is requested after the
task enters the kernel, and stays the same until the task returns to user
space.

The timestamp is passed back to the caller on request, and when the
stacktrace is generated upon returning to user space, each requester's
callback is called with both the timestamp and the stacktrace.
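
As an example, usage by a tracer would look roughly like the sketch
below (the tracer_*() and record_*() helpers are hypothetical; only the
unwind_deferred_*() calls and the callback signature come from this
patch):

static struct unwind_work tracer_unwind_work;

/* Runs in task context just before the task returns to user space */
static void tracer_unwind_callback(struct unwind_work *work,
				   struct unwind_stacktrace *trace,
				   u64 timestamp)
{
	/* trace->entries[0..trace->nr-1] holds the user stack */
	record_user_stack(trace->entries, trace->nr, timestamp);
}

static int tracer_enable(void)
{
	return unwind_deferred_init(&tracer_unwind_work,
				    tracer_unwind_callback);
}

/* Called from the tracer's event hook, possibly in interrupt context */
static void tracer_event(void)
{
	u64 timestamp;

	/*
	 * Returns 0 if the unwind was queued, 1 if it was already
	 * queued by an earlier request in this entry context.
	 */
	if (unwind_deferred_request(&tracer_unwind_work, &timestamp) >= 0)
		/*
		 * Record @timestamp with the kernel-side event so the
		 * deferred user stacktrace can be stitched to it later.
		 */
		record_kernel_event(timestamp);
}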

Co-developed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/unwind_deferred.h       |  18 ++++
 include/linux/unwind_deferred_types.h |   3 +
 kernel/unwind/deferred.c              | 131 +++++++++++++++++++++++++-
 3 files changed, 151 insertions(+), 1 deletion(-)

diff --git a/include/linux/unwind_deferred.h b/include/linux/unwind_deferred.h
index 7d6cb2ffd084..a384eef719a3 100644
--- a/include/linux/unwind_deferred.h
+++ b/include/linux/unwind_deferred.h
@@ -2,9 +2,19 @@
 #ifndef _LINUX_UNWIND_USER_DEFERRED_H
 #define _LINUX_UNWIND_USER_DEFERRED_H
 
+#include <linux/task_work.h>
 #include <linux/unwind_user.h>
 #include <linux/unwind_deferred_types.h>
 
+struct unwind_work;
+
+typedef void (*unwind_callback_t)(struct unwind_work *work, struct unwind_stacktrace *trace, u64 timestamp);
+
+struct unwind_work {
+	struct list_head		list;
+	unwind_callback_t		func;
+};
+
 #ifdef CONFIG_UNWIND_USER
 
 void unwind_task_init(struct task_struct *task);
@@ -12,10 +22,15 @@ void unwind_task_free(struct task_struct *task);
 
 int unwind_deferred_trace(struct unwind_stacktrace *trace);
 
+int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func);
+int unwind_deferred_request(struct unwind_work *work, u64 *timestamp);
+void unwind_deferred_cancel(struct unwind_work *work);
+
 static __always_inline void unwind_exit_to_user_mode(void)
 {
 	if (unlikely(current->unwind_info.cache))
 		current->unwind_info.cache->nr_entries = 0;
+	current->unwind_info.timestamp = 0;
 }
 
 #else /* !CONFIG_UNWIND_USER */
@@ -24,6 +39,9 @@ static inline void unwind_task_init(struct task_struct *task) {}
 static inline void unwind_task_free(struct task_struct *task) {}
 
 static inline int unwind_deferred_trace(struct unwind_stacktrace *trace) { return -ENOSYS; }
+static inline int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func) { return -ENOSYS; }
+static inline int unwind_deferred_request(struct unwind_work *work, u64 *timestamp) { return -ENOSYS; }
+static inline void unwind_deferred_cancel(struct unwind_work *work) {}
 
 static inline void unwind_exit_to_user_mode(void) {}
 
diff --git a/include/linux/unwind_deferred_types.h b/include/linux/unwind_deferred_types.h
index db5b54b18828..5df264cf81ad 100644
--- a/include/linux/unwind_deferred_types.h
+++ b/include/linux/unwind_deferred_types.h
@@ -9,6 +9,9 @@ struct unwind_cache {
 
 struct unwind_task_info {
 	struct unwind_cache	*cache;
+	struct callback_head	work;
+	u64			timestamp;
+	int			pending;
 };
 
 #endif /* _LINUX_UNWIND_USER_DEFERRED_TYPES_H */
diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
index e3913781c8c6..b76c704ddc6d 100644
--- a/kernel/unwind/deferred.c
+++ b/kernel/unwind/deferred.c
@@ -2,13 +2,35 @@
 /*
  * Deferred user space unwinding
  */
+#include <linux/sched/task_stack.h>
+#include <linux/unwind_deferred.h>
+#include <linux/sched/clock.h>
+#include <linux/task_work.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
-#include <linux/unwind_deferred.h>
+#include <linux/mm.h>
 
 #define UNWIND_MAX_ENTRIES 512
 
+/* Guards adding to and reading the list of callbacks */
+static DEFINE_MUTEX(callback_mutex);
+static LIST_HEAD(callbacks);
+
+/*
+ * Read the task context timestamp; the first caller after the task
+ * enters the kernel sets it.
+ */
+static u64 get_timestamp(struct unwind_task_info *info)
+{
+	lockdep_assert_irqs_disabled();
+
+	if (!info->timestamp)
+		info->timestamp = local_clock();
+
+	return info->timestamp;
+}
+
 /**
  * unwind_deferred_trace - Produce a user stacktrace in faultable context
  * @trace: The descriptor that will store the user stacktrace
@@ -59,11 +81,117 @@ int unwind_deferred_trace(struct unwind_stacktrace *trace)
 	return 0;
 }
 
+static void unwind_deferred_task_work(struct callback_head *head)
+{
+	struct unwind_task_info *info = container_of(head, struct unwind_task_info, work);
+	struct unwind_stacktrace trace;
+	struct unwind_work *work;
+	u64 timestamp;
+
+	if (WARN_ON_ONCE(!info->pending))
+		return;
+
+	/* Allow work to come in again */
+	WRITE_ONCE(info->pending, 0);
+
+	/*
+	 * From here on out, the callback must always be called, even if it's
+	 * just an empty trace.
+	 */
+	trace.nr = 0;
+	trace.entries = NULL;
+
+	unwind_deferred_trace(&trace);
+
+	timestamp = info->timestamp;
+
+	guard(mutex)(&callback_mutex);
+	list_for_each_entry(work, &callbacks, list) {
+		work->func(work, &trace, timestamp);
+	}
+}
+
+/**
+ * unwind_deferred_request - Request a user stacktrace on task kernel exit
+ * @work: Unwind descriptor requesting the trace
+ * @timestamp: The timestamp of the first request made for this task
+ *
+ * Schedule a user space unwind to be done in task work before exiting the
+ * kernel.
+ *
+ * The returned @timestamp output is the timestamp of the very first request
+ * for a user space stacktrace for this task since it entered the kernel.
+ * It can be from a request by any caller of this infrastructure.
+ * Its value will also be passed to the callback function.  It can be
+ * used to stitch kernel and user stack traces together in post-processing.
+ *
+ * It's valid to call this function multiple times for the same @work within
+ * the same task entry context.  Each call will return the same timestamp
+ * while the task hasn't left the kernel. If the callback is no longer
+ * pending because it has already been called for this entry context, it
+ * will be queued and called again with the same stack trace and timestamp.
+ *
+ * Return: 1 if the callback was already queued.
+ *         0 if the callback was successfully queued.
+ *         Negative if there's an error.
+ *         @timestamp holds the timestamp of the first request by any user
+ */
+int unwind_deferred_request(struct unwind_work *work, u64 *timestamp)
+{
+	struct unwind_task_info *info = &current->unwind_info;
+	int ret;
+
+	*timestamp = 0;
+
+	if (WARN_ON_ONCE(in_nmi()))
+		return -EINVAL;
+
+	if ((current->flags & (PF_KTHREAD | PF_EXITING)) ||
+	    !user_mode(task_pt_regs(current)))
+		return -EINVAL;
+
+	guard(irqsave)();
+
+	*timestamp = get_timestamp(info);
+
+	/* callback already pending? */
+	if (info->pending)
+		return 1;
+
+	/* The work has been claimed, now schedule it. */
+	ret = task_work_add(current, &info->work, TWA_RESUME);
+	if (WARN_ON_ONCE(ret))
+		return ret;
+
+	info->pending = 1;
+	return 0;
+}
+
+void unwind_deferred_cancel(struct unwind_work *work)
+{
+	if (!work)
+		return;
+
+	guard(mutex)(&callback_mutex);
+	list_del(&work->list);
+}
+
+int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func)
+{
+	memset(work, 0, sizeof(*work));
+
+	guard(mutex)(&callback_mutex);
+	list_add(&work->list, &callbacks);
+	work->func = func;
+	return 0;
+}
+
 void unwind_task_init(struct task_struct *task)
 {
 	struct unwind_task_info *info = &task->unwind_info;
 
 	memset(info, 0, sizeof(*info));
+	init_task_work(&info->work, unwind_deferred_task_work);
 }
 
 void unwind_task_free(struct task_struct *task)
@@ -71,4 +199,5 @@ void unwind_task_free(struct task_struct *task)
 	struct unwind_task_info *info = &task->unwind_info;
 
 	kfree(info->cache);
+	task_work_cancel(task, &info->work);
 }
-- 
2.47.2
Re: [PATCH v10 06/14] unwind_user/deferred: Add deferred unwinding interface
Posted by Peter Zijlstra 3 months, 3 weeks ago
> +struct unwind_work;
> +
> +typedef void (*unwind_callback_t)(struct unwind_work *work, struct unwind_stacktrace *trace, u64 timestamp);
> +
> +struct unwind_work {
> +	struct list_head		list;

Does this really need to be a list? Single linked list like
callback_head not good enough?

> +	unwind_callback_t		func;
> +};
> +
>  #ifdef CONFIG_UNWIND_USER
>  
>  void unwind_task_init(struct task_struct *task);
> @@ -12,10 +22,15 @@ void unwind_task_free(struct task_struct *task);
>  
>  int unwind_deferred_trace(struct unwind_stacktrace *trace);
>  
> +int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func);
> +int unwind_deferred_request(struct unwind_work *work, u64 *timestamp);
> +void unwind_deferred_cancel(struct unwind_work *work);
> +
>  static __always_inline void unwind_exit_to_user_mode(void)
>  {
>  	if (unlikely(current->unwind_info.cache))
>  		current->unwind_info.cache->nr_entries = 0;
> +	current->unwind_info.timestamp = 0;

Surely clearing that timestamp is only relevant when there is a cache
around? Better to not add this unconditional write to the exit path.

>  }
>  
>  #else /* !CONFIG_UNWIND_USER */
> @@ -24,6 +39,9 @@ static inline void unwind_task_init(struct task_struct *task) {}
>  static inline void unwind_task_free(struct task_struct *task) {}
>  
>  static inline int unwind_deferred_trace(struct unwind_stacktrace *trace) { return -ENOSYS; }
> +static inline int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func) { return -ENOSYS; }
> +static inline int unwind_deferred_request(struct unwind_work *work, u64 *timestamp) { return -ENOSYS; }
> +static inline void unwind_deferred_cancel(struct unwind_work *work) {}
>  
>  static inline void unwind_exit_to_user_mode(void) {}
>  
> diff --git a/include/linux/unwind_deferred_types.h b/include/linux/unwind_deferred_types.h
> index db5b54b18828..5df264cf81ad 100644
> --- a/include/linux/unwind_deferred_types.h
> +++ b/include/linux/unwind_deferred_types.h
> @@ -9,6 +9,9 @@ struct unwind_cache {
>  
>  struct unwind_task_info {
>  	struct unwind_cache	*cache;
> +	struct callback_head	work;
> +	u64			timestamp;
> +	int			pending;
>  };
>  
>  #endif /* _LINUX_UNWIND_USER_DEFERRED_TYPES_H */
> diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
> index e3913781c8c6..b76c704ddc6d 100644
> --- a/kernel/unwind/deferred.c
> +++ b/kernel/unwind/deferred.c
> @@ -2,13 +2,35 @@
>  /*
>   * Deferred user space unwinding
>   */
> +#include <linux/sched/task_stack.h>
> +#include <linux/unwind_deferred.h>
> +#include <linux/sched/clock.h>
> +#include <linux/task_work.h>
>  #include <linux/kernel.h>
>  #include <linux/sched.h>
>  #include <linux/slab.h>
> -#include <linux/unwind_deferred.h>
> +#include <linux/mm.h>
>  
>  #define UNWIND_MAX_ENTRIES 512
>  
> +/* Guards adding to and reading the list of callbacks */
> +static DEFINE_MUTEX(callback_mutex);
> +static LIST_HEAD(callbacks);

Global state.. smells like failure.

> +/*
> + * Read the task context timestamp, if this is the first caller then
> + * it will set the timestamp.
> + */
> +static u64 get_timestamp(struct unwind_task_info *info)
> +{
> +	lockdep_assert_irqs_disabled();
> +
> +	if (!info->timestamp)
> +		info->timestamp = local_clock();
> +
> +	return info->timestamp;
> +}
> +
>  /**
>   * unwind_deferred_trace - Produce a user stacktrace in faultable context
>   * @trace: The descriptor that will store the user stacktrace
> @@ -59,11 +81,117 @@ int unwind_deferred_trace(struct unwind_stacktrace *trace)
>  	return 0;
>  }
>  
> +static void unwind_deferred_task_work(struct callback_head *head)
> +{
> +	struct unwind_task_info *info = container_of(head, struct unwind_task_info, work);
> +	struct unwind_stacktrace trace;
> +	struct unwind_work *work;
> +	u64 timestamp;
> +
> +	if (WARN_ON_ONCE(!info->pending))
> +		return;
> +
> +	/* Allow work to come in again */
> +	WRITE_ONCE(info->pending, 0);
> +
> +	/*
> +	 * From here on out, the callback must always be called, even if it's
> +	 * just an empty trace.
> +	 */
> +	trace.nr = 0;
> +	trace.entries = NULL;
> +
> +	unwind_deferred_trace(&trace);
> +
> +	timestamp = info->timestamp;
> +
> +	guard(mutex)(&callback_mutex);
> +	list_for_each_entry(work, &callbacks, list) {
> +		work->func(work, &trace, timestamp);
> +	}

So now you're globally serializing all return-to-user instances. How is
that not a problem?

> +}
Re: [PATCH v10 06/14] unwind_user/deferred: Add deferred unwinding interface
Posted by Steven Rostedt 3 months, 2 weeks ago
On Wed, 18 Jun 2025 20:46:20 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> >  static __always_inline void unwind_exit_to_user_mode(void)
> >  {
> >  	if (unlikely(current->unwind_info.cache))
> >  		current->unwind_info.cache->nr_entries = 0;
> > +	current->unwind_info.timestamp = 0;  
> 
> Surely clearing that timestamp is only relevant when there is a cache
> around? Better to not add this unconditional write to the exit path.
> 
> >  }

Note, the timestamp could be there if the cache failed to allocate, and
we would still want to clear the timestamp. I did turn this into:

static __always_inline void unwind_reset_info(void)
{
	/* Exit out early if this was never used */
	if (likely(!current->unwind_info.timestamp))
		return;

	if (current->unwind_info.cache)
		current->unwind_info.cache->nr_entries = 0;
	current->unwind_info.timestamp = 0;
}

For this patch, but later patches will give us the bitmask to check.

-- Steve
Re: [PATCH v10 06/14] unwind_user/deferred: Add deferred unwinding interface
Posted by Steven Rostedt 3 months, 3 weeks ago
On Wed, 18 Jun 2025 20:46:20 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> > +struct unwind_work;
> > +
> > +typedef void (*unwind_callback_t)(struct unwind_work *work, struct unwind_stacktrace *trace, u64 timestamp);
> > +
> > +struct unwind_work {
> > +	struct list_head		list;  
> 
> Does this really need to be a list? Single linked list like
> callback_head not good enough?

Doesn't a list head make it easier to remove without having to iterate the
list?
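
Roughly (just a sketch; the singly linked variant's field names are
made up):

	/* With a list_head, removal is O(1), no iteration needed: */
	guard(mutex)(&callback_mutex);
	list_del(&work->list);

	/* With a singly linked list, removal must walk the list to
	 * find the pointer to @work: */
	for (p = &callbacks; *p; p = &(*p)->next) {
		if (*p == &work->node) {
			*p = work->node.next;
			break;
		}
	}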

> 
> > +	unwind_callback_t		func;
> > +};
> > +
> >  #ifdef CONFIG_UNWIND_USER
> >  
> >  void unwind_task_init(struct task_struct *task);
> > @@ -12,10 +22,15 @@ void unwind_task_free(struct task_struct *task);
> >  
> >  int unwind_deferred_trace(struct unwind_stacktrace *trace);
> >  
> > +int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func);
> > +int unwind_deferred_request(struct unwind_work *work, u64 *timestamp);
> > +void unwind_deferred_cancel(struct unwind_work *work);
> > +
> >  static __always_inline void unwind_exit_to_user_mode(void)
> >  {
> >  	if (unlikely(current->unwind_info.cache))
> >  		current->unwind_info.cache->nr_entries = 0;
> > +	current->unwind_info.timestamp = 0;  
> 
> Surely clearing that timestamp is only relevant when there is a cache
> around? Better to not add this unconditional write to the exit path.

That's actually not quite true. If the allocation fails, we still want to
clear the timestamp. But later patches add more data to check and it does
exit out if there's been no requests:

{
        struct unwind_task_info *info = &current->unwind_info;
        unsigned long bits;

        /* Was there any unwinding? */
        if (likely(!info->unwind_mask))
                return;

        bits = info->unwind_mask;
        do {
                /* Is a task_work going to run again before going back */
                if (bits & UNWIND_PENDING)
                        return;
        } while (!try_cmpxchg(&info->unwind_mask, &bits, 0UL));

        if (likely(info->cache))
                info->cache->nr_entries = 0;
        info->timestamp = 0;
}

But for better reviewing, I could add a comment in this patch that states
that this will eventually exit out early when it does more work.


> 
> >  }
> >  
> >  #else /* !CONFIG_UNWIND_USER */
> > @@ -24,6 +39,9 @@ static inline void unwind_task_init(struct task_struct *task) {}
> >  static inline void unwind_task_free(struct task_struct *task) {}
> >  
> >  static inline int unwind_deferred_trace(struct unwind_stacktrace *trace) { return -ENOSYS; }
> > +static inline int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func) { return -ENOSYS; }
> > +static inline int unwind_deferred_request(struct unwind_work *work, u64 *timestamp) { return -ENOSYS; }
> > +static inline void unwind_deferred_cancel(struct unwind_work *work) {}
> >  
> >  static inline void unwind_exit_to_user_mode(void) {}
> >  
> > diff --git a/include/linux/unwind_deferred_types.h b/include/linux/unwind_deferred_types.h
> > index db5b54b18828..5df264cf81ad 100644
> > --- a/include/linux/unwind_deferred_types.h
> > +++ b/include/linux/unwind_deferred_types.h
> > @@ -9,6 +9,9 @@ struct unwind_cache {
> >  
> >  struct unwind_task_info {
> >  	struct unwind_cache	*cache;
> > +	struct callback_head	work;
> > +	u64			timestamp;
> > +	int			pending;
> >  };
> >  
> >  #endif /* _LINUX_UNWIND_USER_DEFERRED_TYPES_H */
> > diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
> > index e3913781c8c6..b76c704ddc6d 100644
> > --- a/kernel/unwind/deferred.c
> > +++ b/kernel/unwind/deferred.c
> > @@ -2,13 +2,35 @@
> >  /*
> >   * Deferred user space unwinding
> >   */
> > +#include <linux/sched/task_stack.h>
> > +#include <linux/unwind_deferred.h>
> > +#include <linux/sched/clock.h>
> > +#include <linux/task_work.h>
> >  #include <linux/kernel.h>
> >  #include <linux/sched.h>
> >  #include <linux/slab.h>
> > -#include <linux/unwind_deferred.h>
> > +#include <linux/mm.h>
> >  
> >  #define UNWIND_MAX_ENTRIES 512
> >  
> > +/* Guards adding to and reading the list of callbacks */
> > +static DEFINE_MUTEX(callback_mutex);
> > +static LIST_HEAD(callbacks);  
> 
> Global state.. smells like failure.

Yes, the unwind infrastructure is global, as it is how tasks know which
tracers' callbacks to call.

> 
> > +/*
> > + * Read the task context timestamp, if this is the first caller then
> > + * it will set the timestamp.
> > + */
> > +static u64 get_timestamp(struct unwind_task_info *info)
> > +{
> > +	lockdep_assert_irqs_disabled();
> > +
> > +	if (!info->timestamp)
> > +		info->timestamp = local_clock();
> > +
> > +	return info->timestamp;
> > +}
> > +
> >  /**
> >   * unwind_deferred_trace - Produce a user stacktrace in faultable context
> >   * @trace: The descriptor that will store the user stacktrace
> > @@ -59,11 +81,117 @@ int unwind_deferred_trace(struct unwind_stacktrace *trace)
> >  	return 0;
> >  }
> >  
> > +static void unwind_deferred_task_work(struct callback_head *head)
> > +{
> > +	struct unwind_task_info *info = container_of(head, struct unwind_task_info, work);
> > +	struct unwind_stacktrace trace;
> > +	struct unwind_work *work;
> > +	u64 timestamp;
> > +
> > +	if (WARN_ON_ONCE(!info->pending))
> > +		return;
> > +
> > +	/* Allow work to come in again */
> > +	WRITE_ONCE(info->pending, 0);
> > +
> > +	/*
> > +	 * From here on out, the callback must always be called, even if it's
> > +	 * just an empty trace.
> > +	 */
> > +	trace.nr = 0;
> > +	trace.entries = NULL;
> > +
> > +	unwind_deferred_trace(&trace);
> > +
> > +	timestamp = info->timestamp;
> > +
> > +	guard(mutex)(&callback_mutex);
> > +	list_for_each_entry(work, &callbacks, list) {
> > +		work->func(work, &trace, timestamp);
> > +	}  
> 
> So now you're globally serializing all return-to-user instances. How is
> that not a problem?

It was the original way we did things. The next patch changes this to SRCU.
But it requires a bit more care. For breaking up the series, I preferred
not to add that logic and make it a separate patch.

For better reviewing, I'll add a comment here that says:

	/* TODO switch this global lock to SRCU */
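
Roughly, the SRCU version ends up looking like this (a sketch; the
actual patch may differ in detail):

DEFINE_STATIC_SRCU(unwind_srcu);

static void unwind_deferred_task_work(struct callback_head *head)
{
	...
	/* Readers no longer serialize against each other */
	guard(srcu)(&unwind_srcu);
	list_for_each_entry_srcu(work, &callbacks, list,
				 srcu_read_lock_held(&unwind_srcu))
		work->func(work, &trace, timestamp);
}

void unwind_deferred_cancel(struct unwind_work *work)
{
	guard(mutex)(&callback_mutex);
	list_del_rcu(&work->list);
	/* Wait for any in-flight callback iteration to finish */
	synchronize_srcu(&unwind_srcu);
}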

-- Steve
Re: [PATCH v10 06/14] unwind_user/deferred: Add deferred unwinding interface
Posted by Peter Zijlstra 3 months, 3 weeks ago
On Wed, Jun 18, 2025 at 03:09:15PM -0400, Steven Rostedt wrote:
> On Wed, 18 Jun 2025 20:46:20 +0200
> Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > > +struct unwind_work;
> > > +
> > > +typedef void (*unwind_callback_t)(struct unwind_work *work, struct unwind_stacktrace *trace, u64 timestamp);
> > > +
> > > +struct unwind_work {
> > > +	struct list_head		list;  
> > 
> > Does this really need to be a list? Single linked list like
> > callback_head not good enough?
> 
> Doesn't a list head make it easier to remove without having to iterate the
> list?

Yeah, but why would you ever want to remove it? You asked for an unwind,
you get an unwind, no?

> > >  static __always_inline void unwind_exit_to_user_mode(void)
> > >  {
> > >  	if (unlikely(current->unwind_info.cache))
> > >  		current->unwind_info.cache->nr_entries = 0;
> > > +	current->unwind_info.timestamp = 0;  
> > 
> > Surely clearing that timestamp is only relevant when there is a cache
> > around? Better to not add this unconditional write to the exit path.
> 
> That's actually not quite true. If the allocation fails, we still want to
> clear the timestamp. But later patches add more data to check and it does
> exit out if there's been no requests:

Well, you could put in an error value on alloc fail or somesuch. Then
its non-zero.

> But for better reviewing, I could add a comment in this patch that states
> that this will eventually exit out early when it does more work.

You're making this really hard to review, why not do it right from the
get-go?

> > > +/* Guards adding to and reading the list of callbacks */
> > > +static DEFINE_MUTEX(callback_mutex);
> > > +static LIST_HEAD(callbacks);  
> > 
> > Global state.. smells like failure.
> 
> Yes, the unwind infrastructure is global, as it is the way tasks know what
> tracer's callbacks to call.

Well, that's apparently how you've set it up. I don't immediately see
this has to be like this.

And there's no comments no nothing.

I don't see why you can't have something like:

struct unwind_work {
	struct callback_head task_work;
	void *data;
	void (*func)(struct unwind_work *work, void *data);
};

void unwind_task_work_func(struct callback_head *task_work)
{
	struct unwind_work *uw = container_of(task_work, struct unwind_work, task_work);

	// do actual unwind

	uw->func(uw, uw->data);
}

or something along those lines. No global state involved.


> > > +	guard(mutex)(&callback_mutex);
> > > +	list_for_each_entry(work, &callbacks, list) {
> > > +		work->func(work, &trace, timestamp);
> > > +	}  
> > 
> > So now you're globally serializing all return-to-user instances. How is
> > that not a problem?
> 
> It was the original way we did things. The next patch changes this to SRCU.
> But it requires a bit more care. For breaking up the series, I preferred
> not to add that logic and make it a separate patch.
> 
> For better reviewing, I'll add a comment here that says:
> 
> 	/* TODO switch this global lock to SRCU */

Oh ffs :-(

So splitting up patches is for ease of review, but now you're making
splits that make review harder, how does that make sense?
Re: [PATCH v10 06/14] unwind_user/deferred: Add deferred unwinding interface
Posted by Steven Rostedt 3 months, 3 weeks ago
On Thu, 19 Jun 2025 09:50:08 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> On Wed, Jun 18, 2025 at 03:09:15PM -0400, Steven Rostedt wrote:
> > On Wed, 18 Jun 2025 20:46:20 +0200
> > Peter Zijlstra <peterz@infradead.org> wrote:
> >   
> > > > +struct unwind_work;
> > > > +
> > > > +typedef void (*unwind_callback_t)(struct unwind_work *work, struct unwind_stacktrace *trace, u64 timestamp);
> > > > +
> > > > +struct unwind_work {
> > > > +	struct list_head		list;    
> > > 
> > > Does this really need to be a list? Single linked list like
> > > callback_head not good enough?  
> > 
> > Doesn't a list head make it easier to remove without having to iterate the
> > list?  
> 
> Yeah, but why would you ever want to remove it? You asked for an unwind,
> you get an unwind, no?

No, it's not unique per tracing infrastructure, but per tracing instance.
That is, per perf program, or per ftrace instance. It needs to be
removed.

> 
> > > >  static __always_inline void unwind_exit_to_user_mode(void)
> > > >  {
> > > >  	if (unlikely(current->unwind_info.cache))
> > > >  		current->unwind_info.cache->nr_entries = 0;
> > > > +	current->unwind_info.timestamp = 0;    
> > > 
> > > Surely clearing that timestamp is only relevant when there is a cache
> > > around? Better to not add this unconditional write to the exit path.  
> > 
> > That's actually not quite true. If the allocation fails, we still want to
> > clear the timestamp. But later patches add more data to check and it does
> > exit out if there's been no requests:  
> 
> Well, you could put in an error value on alloc fail or somesuch. Then
> its non-zero.

OK.

> 
> > But for better reviewing, I could add a comment in this patch that states
> > that this will eventually exit out early when it does more work.  
> 
> You're making this really hard to review, why not do it right from the
> get-go?

Because the value that is to be checked isn't here yet.

> 
> > > > +/* Guards adding to and reading the list of callbacks */
> > > > +static DEFINE_MUTEX(callback_mutex);
> > > > +static LIST_HEAD(callbacks);    
> > > 
> > > Global state.. smells like failure.  
> > 
> > Yes, the unwind infrastructure is global, as it is the way tasks know what
> > tracer's callbacks to call.  
> 
> Well, that's apparently how you've set it up. I don't immediately see
> this has to be like this.
> 
> And there's no comments no nothing.
> 
> I don't see why you can't have something like:
> 
> struct unwind_work {
> 	struct callback_head task_work;
> 	void *data;
> 	void (*func)(struct unwind_work *work, void *data);
> };
> 
> void unwind_task_work_func(struct callback_head *task_work)
> {
> 	struct unwind_work *uw = container_of(task_work, struct unwind_work, task_work);
> 
> 	// do actual unwind
> 
> 	uw->func(uw, uw->data);
> }
> 
> or something along those lines. No global state involved.

We have a many-to-many relationship here where a task_work doesn't work.

That is, you can have a tracer that expects callbacks from several
tasks at the same time, as well as some of those tasks expect to send a
callback to different tracers.

Later patches add a bitmask to every task that gets set to know which
tracers to call back.

Since the number of tracers that can be called back is fixed to the
number of bits in long (for the bitmask), I can get rid of the link
list and make it into an array. That would make this easier.
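
Something like this (sketch, names illustrative):

#define UNWIND_MAX_CALLBACKS	BITS_PER_LONG

static struct unwind_work *callbacks[UNWIND_MAX_CALLBACKS];

	/* On request: record which tracer wants this task's unwind */
	set_bit(work->bit, &current->unwind_info.unwind_mask);

	/* In the task work: only call back the tracers that asked */
	for_each_set_bit(bit, &info->unwind_mask, UNWIND_MAX_CALLBACKS) {
		struct unwind_work *work = READ_ONCE(callbacks[bit]);

		if (work)
			work->func(work, &trace, timestamp);
	}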


> 
> 
> > > > +	guard(mutex)(&callback_mutex);
> > > > +	list_for_each_entry(work, &callbacks, list) {
> > > > +		work->func(work, &trace, timestamp);
> > > > +	}    
> > > 
> > > So now you're globally serializing all return-to-user instances. How is
> > > that not a problem?  
> > 
> > It was the original way we did things. The next patch changes this to SRCU.
> > But it requires a bit more care. For breaking up the series, I preferred
> > not to add that logic and make it a separate patch.
> > 
> > For better reviewing, I'll add a comment here that says:
> > 
> > 	/* TODO switch this global lock to SRCU */  
> 
> Oh ffs :-(
> 
> So splitting up patches is for ease of review, but now you're making
> splits that make review harder, how does that make sense?

Actually, a comment isn't the right place; I should have mentioned this
in the change log.

-- Steve
Re: [PATCH v10 06/14] unwind_user/deferred: Add deferred unwinding interface
Posted by Peter Zijlstra 3 months, 3 weeks ago
On Thu, Jun 19, 2025 at 04:56:59AM -0400, Steven Rostedt wrote:

> We have a many to many relationship here where a task_work doesn't work.
> 
> That is, you can have a tracer that expects callbacks from several
> tasks at the same time, as well as some of those tasks expect to send a
> callback to different tracers.
> 
> Later patches add a bitmask to every task that gets set to know which
> trace to use.
> 
> Since the number of tracers that can be called back is fixed to the
> number of bits in long (for the bitmask), I can get rid of the link
> list and make it into an array. That would make this easier.

So something sketching this design decision might be useful. Perhaps a
comment in the file itself?

I feel much of this complication stems from the fact you're wanting to
make this perhaps too generic.
Re: [PATCH v10 06/14] unwind_user/deferred: Add deferred unwinding interface
Posted by Steven Rostedt 3 months, 2 weeks ago
On Thu, 19 Jun 2025 11:11:21 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> I feel much of this complication stems from the fact you're wanting to
> make this perhaps too generic.

I want to make it work for perf, ftrace, LTTng and BPF, where each has
their own requirements, which tends to force making it generic. :-/

-- Steve
Re: [PATCH v10 06/14] unwind_user/deferred: Add deferred unwinding interface
Posted by Steven Rostedt 3 months, 3 weeks ago
On Wed, 18 Jun 2025 15:09:15 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> > So now you're globally serializing all return-to-user instances. How is
> > that not a problem?  
> 
> It was the original way we did things. The next patch changes this to SRCU.
> But it requires a bit more care. For breaking up the series, I preferred
> not to add that logic and make it a separate patch.

It's not the next patch but patch 9 (three away).

-- Steve
Re: [PATCH v10 06/14] unwind_user/deferred: Add deferred unwinding interface
Posted by Peter Zijlstra 3 months, 3 weeks ago
On Tue, Jun 10, 2025 at 08:54:27PM -0400, Steven Rostedt wrote:
> From: Josh Poimboeuf <jpoimboe@kernel.org>
> 
> Add an interface for scheduling task work to unwind the user space stack
> before returning to user space. This solves several problems for its
> callers:
> 
>   - Ensure the unwind happens in task context even if the caller may be
>     running in interrupt context.
> 
>   - Avoid duplicate unwinds, whether called multiple times by the same
>     caller or by different callers.
> 
>   - Take a timestamp when the first request comes in since the task
>     entered the kernel. This will be returned to the calling function
>     along with the stack trace when the task leaves the kernel. This
>     timestamp can be used to correlate kernel unwinds/traces with the user
>     unwind.
> 
> The timestamp is created to detect when the stacktrace is the same. It is
> generated the first time a user space stacktrace is requested after the
> task enters the kernel.
> 
> The timestamp is passed to the caller on request, and when the stacktrace is
> generated upon returning to user space, it call the requester's callback
> with the timestamp as well as the stacktrace.

This whole story hinges on there being a high resolution time-stamp
available... Good thing we killed x86 !TSC support when we did. You sure
there's no other architectures you're interested in that lack a high res
time source?

What about two CPUs managing to request an unwind at exactly the same
time?
Re: [PATCH v10 06/14] unwind_user/deferred: Add deferred unwinding interface
Posted by Steven Rostedt 3 months, 3 weeks ago
On Wed, 18 Jun 2025 16:20:00 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> > The timestamp is passed to the caller on request, and when the stacktrace is
> > generated upon returning to user space, it call the requester's callback
> > with the timestamp as well as the stacktrace.  
> 
> This whole story hinges on there being a high resolution time-stamp
> available... Good thing we killed x86 !TSC support when we did. You sure
> there's no other architectures you're interested in that lack a high res
> time source?
> 
> What about two CPUs managing to request an unwind at exactly the same
> time?

It's mapped to a task. As long as each timestamp is unique for a task it
should be fine, as the trace can record the current->pid along with the
timestamp to map to the unique user space stack trace.

As for resolution, as long as two back-to-back system calls can't land
within the same time stamp, we're fine. Otherwise, yeah, we have an issue.
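
I.e., a post-processing tool could match them up like this (the event
structs here are made up):

struct kernel_event     { int pid; u64 timestamp; /* ... */ };
struct user_stack_event { int pid; u64 timestamp; /* ... */ };

/* A kernel event belongs to a deferred user stacktrace iff both the
 * pid and the timestamp match: */
static bool same_context(struct kernel_event *k, struct user_stack_event *u)
{
	return k->pid == u->pid && k->timestamp == u->timestamp;
}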

-- Steve
Re: [PATCH v10 06/14] unwind_user/deferred: Add deferred unwinding interface
Posted by Peter Zijlstra 3 months, 3 weeks ago
On Wed, Jun 18, 2025 at 11:37:06AM -0400, Steven Rostedt wrote:
> On Wed, 18 Jun 2025 16:20:00 +0200
> Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > > The timestamp is passed to the caller on request, and when the stacktrace is
> > > generated upon returning to user space, it call the requester's callback
> > > with the timestamp as well as the stacktrace.  
> > 
> > This whole story hinges on there being a high resolution time-stamp
> > available... Good thing we killed x86 !TSC support when we did. You sure
> > there's no other architectures you're interested in that lack a high res
> > time source?
> > 
> > What about two CPUs managing to request an unwind at exactly the same
> > time?
> 
> It's mapped to a task. As long as each timestamp is unique for a task it
> should be fine. As the trace can record the current->pid along with the
> timestamp to map to the unique user space stack trace.
> 
> As for resolution, as long as there can't be two system calls back to back
> within the same time stamp. Otherwise, yeah, we have an issue.

Well, up until very recently, jiffies was the fallback clock on x86. You
can get PID reuse in a jiffy if you push things hard.

Most archs seem to have some higher res clock these days, but I would
not be surprised if among the museum pieces we still support there's
some crap lying in wait.

If you want to rely on consecutive system calls never seeing the same
timestamp, let alone PID reuse in the same timestamp -- for some generic
infrastructure -- you need to go audit all the arch code.
Re: [PATCH v10 06/14] unwind_user/deferred: Add deferred unwinding interface
Posted by Steven Rostedt 3 months, 3 weeks ago
On Thu, 19 Jun 2025 10:01:08 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> If you want to rely on consecutive system calls never seeing the same
> timestamp, let alone PID reuse in the same timestamp -- for some generic
> infrastructure -- you need to go audit all the arch code.

I can drop the timestamp and go back to the original "cookie"
generation that guaranteed unique ids for something like 2^48 system
calls per task.

I thought the timestamps just made it easier. But having to audit every
arch may make that not the case.
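
For reference, that cookie scheme was roughly (a sketch; details may
differ):

static DEFINE_PER_CPU(u64, unwind_ctx_ctr);

static u64 get_cookie(void)
{
	u64 cpu = raw_smp_processor_id();
	u64 ctr = this_cpu_inc_return(unwind_ctx_ctr);

	/* CPU number in the top 16 bits, counter in the low 48 bits:
	 * ids repeat only after a single CPU makes 2^48 requests. */
	return (cpu << 48) | (ctr & ((1ULL << 48) - 1));
}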

-- Steve
Re: [PATCH v10 06/14] unwind_user/deferred: Add deferred unwinding interface
Posted by Steven Rostedt 3 months, 3 weeks ago
On Wed, 18 Jun 2025 11:37:06 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> > What about two CPUs managing to request an unwind at exactly the same
> > time?  
> 
> It's mapped to a task. As long as each timestamp is unique for a task it
> should be fine. As the trace can record the current->pid along with the
> timestamp to map to the unique user space stack trace.
> 
> As for resolution, as long as there can't be two system calls back to back
> within the same time stamp. Otherwise, yeah, we have an issue.

I'll add a comment that states this as a constraint.

-- Steve