[PATCH v3 11/19] unwind: Add deferred user space unwinding API
Posted by Josh Poimboeuf 3 weeks, 6 days ago
Add unwind_user_deferred() which allows callers to schedule task work to
unwind the user space stack before returning to user space.  This solves
several problems for its callers:

  - Ensure the unwind happens in task context even if the caller may be
    running in interrupt context.

  - Only do the unwind once, even if called multiple times either by the
    same caller or multiple callers.

  - Create a "context context" cookie which allows trace post-processing
    to correlate kernel unwinds/traces with the user unwind.

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 include/linux/entry-common.h |   3 +
 include/linux/sched.h        |   5 +
 include/linux/unwind_user.h  |  56 ++++++++++
 kernel/fork.c                |   4 +
 kernel/unwind/user.c         | 199 +++++++++++++++++++++++++++++++++++
 5 files changed, 267 insertions(+)

diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index 1e50cdb83ae5..efbe8f964f31 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -12,6 +12,7 @@
 #include <linux/resume_user_mode.h>
 #include <linux/tick.h>
 #include <linux/kmsan.h>
+#include <linux/unwind_user.h>
 
 #include <asm/entry-common.h>
 
@@ -111,6 +112,8 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs)
 	CT_WARN_ON(__ct_state() != CT_STATE_USER);
 	user_exit_irqoff();
 
+	unwind_enter_from_user_mode();
+
 	instrumentation_begin();
 	kmsan_unpoison_entry_regs(regs);
 	trace_hardirqs_off_finish();
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5007a8e2d640..31b6f1d763ef 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -47,6 +47,7 @@
 #include <linux/livepatch_sched.h>
 #include <linux/uidgid_types.h>
 #include <asm/kmap_size.h>
+#include <linux/unwind_user.h>
 
 /* task_struct member predeclarations (sorted alphabetically): */
 struct audit_context;
@@ -1592,6 +1593,10 @@ struct task_struct {
 	struct user_event_mm		*user_event_mm;
 #endif
 
+#ifdef CONFIG_UNWIND_USER
+	struct unwind_task_info		unwind_task_info;
+#endif
+
 	/*
 	 * New fields for task_struct should be added above here, so that
 	 * they are included in the randomized portion of task_struct.
diff --git a/include/linux/unwind_user.h b/include/linux/unwind_user.h
index cde0fde4923e..98e236c843b1 100644
--- a/include/linux/unwind_user.h
+++ b/include/linux/unwind_user.h
@@ -3,6 +3,9 @@
 #define _LINUX_UNWIND_USER_H
 
 #include <linux/types.h>
+#include <linux/percpu-defs.h>
+
+#define UNWIND_MAX_CALLBACKS 4
 
 enum unwind_user_type {
 	UNWIND_USER_TYPE_NONE,
@@ -30,6 +33,26 @@ struct unwind_user_state {
 	bool done;
 };
 
+struct unwind_task_info {
+	u64			ctx_cookie;
+	u32			pending_callbacks;
+	u64			last_cookies[UNWIND_MAX_CALLBACKS];
+	void			*privs[UNWIND_MAX_CALLBACKS];
+	unsigned long		*entries;
+	struct callback_head	work;
+};
+
+typedef void (*unwind_callback_t)(struct unwind_stacktrace *trace,
+				  u64 ctx_cookie, void *data);
+
+struct unwind_callback {
+	unwind_callback_t		func;
+	int				idx;
+};
+
+
+#ifdef CONFIG_UNWIND_USER
+
 /* Synchronous interfaces: */
 
 int unwind_user_start(struct unwind_user_state *state);
@@ -40,4 +63,37 @@ int unwind_user(struct unwind_stacktrace *trace, unsigned int max_entries);
 #define for_each_user_frame(state) \
 	for (unwind_user_start((state)); !(state)->done; unwind_user_next((state)))
 
+
+/* Asynchronous interfaces: */
+
+void unwind_task_init(struct task_struct *task);
+void unwind_task_free(struct task_struct *task);
+
+int unwind_user_register(struct unwind_callback *callback, unwind_callback_t func);
+int unwind_user_unregister(struct unwind_callback *callback);
+
+int unwind_user_deferred(struct unwind_callback *callback, u64 *ctx_cookie, void *data);
+
+DECLARE_PER_CPU(u64, unwind_ctx_ctr);
+
+static __always_inline void unwind_enter_from_user_mode(void)
+{
+	__this_cpu_inc(unwind_ctx_ctr);
+}
+
+
+#else /* !CONFIG_UNWIND_USER */
+
+static inline void unwind_task_init(struct task_struct *task) {}
+static inline void unwind_task_free(struct task_struct *task) {}
+
+static inline int unwind_user_register(struct unwind_callback *callback, unwind_callback_t func) { return -ENOSYS; }
+static inline int unwind_user_unregister(struct unwind_callback *callback) { return -ENOSYS; }
+
+static inline int unwind_user_deferred(struct unwind_callback *callback, u64 *ctx_cookie, void *data) { return -ENOSYS; }
+
+static inline void unwind_enter_from_user_mode(void) {}
+
+#endif /* !CONFIG_UNWIND_USER */
+
 #endif /* _LINUX_UNWIND_USER_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 60f14fbab956..d7580067853d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -105,6 +105,7 @@
 #include <linux/rseq.h>
 #include <uapi/linux/pidfd.h>
 #include <linux/pidfs.h>
+#include <linux/unwind_user.h>
 #include <linux/sframe.h>
 
 #include <asm/pgalloc.h>
@@ -972,6 +973,7 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(refcount_read(&tsk->usage));
 	WARN_ON(tsk == current);
 
+	unwind_task_free(tsk);
 	sched_ext_free(tsk);
 	io_uring_free(tsk);
 	cgroup_free(tsk);
@@ -2348,6 +2350,8 @@ __latent_entropy struct task_struct *copy_process(
 	p->bpf_ctx = NULL;
 #endif
 
+	unwind_task_init(p);
+
 	/* Perform scheduler related setup. Assign this task to a CPU. */
 	retval = sched_fork(clone_flags, p);
 	if (retval)
diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c
index 8e47c80e3e54..ed7759c56551 100644
--- a/kernel/unwind/user.c
+++ b/kernel/unwind/user.c
@@ -10,6 +10,11 @@
 #include <linux/unwind_user.h>
 #include <linux/sframe.h>
 #include <linux/uaccess.h>
+#include <linux/slab.h>
+#include <linux/task_work.h>
+#include <linux/mm.h>
+
+#define UNWIND_MAX_ENTRIES 512
 
 #ifdef CONFIG_HAVE_UNWIND_USER_FP
 #include <asm/unwind_user.h>
@@ -20,6 +25,12 @@ static struct unwind_user_frame fp_frame = {
 static struct unwind_user_frame fp_frame;
 #endif
 
+static struct unwind_callback *callbacks[UNWIND_MAX_CALLBACKS];
+static DECLARE_RWSEM(callbacks_rwsem);
+
+/* Counter for entries from user space */
+DEFINE_PER_CPU(u64, unwind_ctx_ctr);
+
 int unwind_user_next(struct unwind_user_state *state)
 {
 	struct unwind_user_frame _frame;
@@ -117,3 +128,191 @@ int unwind_user(struct unwind_stacktrace *trace, unsigned int max_entries)
 
 	return 0;
 }
+
+/*
+ * The "context cookie" is a unique identifier which allows post-processing to
+ * correlate kernel trace(s) with user unwinds.  It has the CPU id in the
+ * highest 16 bits and a per-CPU entry counter in the lower 48 bits.
+ */
+static u64 ctx_to_cookie(u64 cpu, u64 ctx)
+{
+	BUILD_BUG_ON(NR_CPUS > 65535);
+	return (ctx & ((1UL << 48) - 1)) | cpu;
+}
+
+/*
+ * Schedule a user space unwind to be done in task work before exiting the
+ * kernel.
+ *
+ * The @callback must have previously been registered with
+ * unwind_user_register().
+ *
+ * The @ctx_cookie output is a unique identifier which will also be passed to the
+ * callback function.  It can be used to stitch kernel and user traces together
+ * in post-processing.
+ *
+ * If there are multiple calls to this function for a given @callback, the
+ * cookie will usually be the same and the callback will only be called once.
+ *
+ * The only exception is when the task has migrated to another CPU, *and* this
+ * is called while the task work is running (or has already run).  Then a new
+ * cookie will be generated and the callback will be called again for the new
+ * cookie.
+ */
+int unwind_user_deferred(struct unwind_callback *callback, u64 *ctx_cookie, void *data)
+{
+	struct unwind_task_info *info = &current->unwind_task_info;
+	u64 cookie = info->ctx_cookie;
+	int idx = callback->idx;
+
+	if (WARN_ON_ONCE(in_nmi()))
+		return -EINVAL;
+
+	if (WARN_ON_ONCE(!callback->func || idx < 0))
+		return -EINVAL;
+
+	if (!current->mm)
+		return -EINVAL;
+
+	guard(irqsave)();
+
+	if (cookie && (info->pending_callbacks & (1 << idx)))
+		goto done;
+
+	/*
+	 * If this is the first call from *any* caller since the most recent
+	 * entry from user space, initialize the task context cookie and
+	 * schedule the task work.
+	 */
+	if (!cookie) {
+		u64 ctx_ctr = __this_cpu_read(unwind_ctx_ctr);
+		u64 cpu = raw_smp_processor_id();
+
+		cookie = ctx_to_cookie(cpu, ctx_ctr);
+
+		/*
+		 * If called after task work has sent an unwind to the callback
+		 * function but before the exit to user space, skip it as the
+		 * previous call to the callback function should suffice.
+		 *
+		 * The only exception is if this task has migrated to another
+		 * CPU since the first call to unwind_user_deferred().  The
+		 * per-CPU context counter will have changed which will result
+		 * in a new cookie and another unwind (see comment above
+		 * function).
+		 */
+		if (cookie == info->last_cookies[idx])
+			goto done;
+
+		info->ctx_cookie = cookie;
+		task_work_add(current, &info->work, TWA_RESUME);
+	}
+
+	info->pending_callbacks |= (1 << idx);
+	info->privs[idx] = data;
+	info->last_cookies[idx] = cookie;
+
+done:
+	if (ctx_cookie)
+		*ctx_cookie = cookie;
+	return 0;
+}
+
+static void unwind_user_task_work(struct callback_head *head)
+{
+	struct unwind_task_info *info = container_of(head, struct unwind_task_info, work);
+	struct task_struct *task = container_of(info, struct task_struct, unwind_task_info);
+	void *privs[UNWIND_MAX_CALLBACKS];
+	struct unwind_stacktrace trace;
+	unsigned long pending;
+	u64 cookie = 0;
+	int i;
+
+	BUILD_BUG_ON(UNWIND_MAX_CALLBACKS > 32);
+
+	if (WARN_ON_ONCE(task != current))
+		return;
+
+	if (WARN_ON_ONCE(!info->ctx_cookie || !info->pending_callbacks))
+		return;
+
+	scoped_guard(irqsave) {
+		pending = info->pending_callbacks;
+		cookie = info->ctx_cookie;
+
+		info->pending_callbacks = 0;
+		info->ctx_cookie = 0;
+		memcpy(privs, info->privs, sizeof(void *) * UNWIND_MAX_CALLBACKS);
+	}
+
+	if (!info->entries) {
+		info->entries = kmalloc(UNWIND_MAX_ENTRIES * sizeof(long),
+					GFP_KERNEL);
+		if (!info->entries)
+			return;
+	}
+
+	trace.entries = info->entries;
+	trace.nr = 0;
+	unwind_user(&trace, UNWIND_MAX_ENTRIES);
+
+	guard(rwsem_read)(&callbacks_rwsem);
+
+	for_each_set_bit(i, &pending, UNWIND_MAX_CALLBACKS) {
+		if (callbacks[i])
+			callbacks[i]->func(&trace, cookie, privs[i]);
+	}
+}
+
+int unwind_user_register(struct unwind_callback *callback, unwind_callback_t func)
+{
+	scoped_guard(rwsem_write, &callbacks_rwsem) {
+		for (int i = 0; i < UNWIND_MAX_CALLBACKS; i++) {
+			if (!callbacks[i]) {
+				callback->func = func;
+				callback->idx = i;
+				callbacks[i] = callback;
+				return 0;
+			}
+		}
+	}
+
+	callback->func = NULL;
+	callback->idx = -1;
+	return -ENOSPC;
+}
+
+int unwind_user_unregister(struct unwind_callback *callback)
+{
+	if (callback->idx < 0)
+		return -EINVAL;
+
+	scoped_guard(rwsem_write, &callbacks_rwsem)
+		callbacks[callback->idx] = NULL;
+
+	callback->func = NULL;
+	callback->idx = -1;
+
+	return 0;
+}
+
+void unwind_task_init(struct task_struct *task)
+{
+	struct unwind_task_info *info = &task->unwind_task_info;
+
+	info->entries		= NULL;
+	info->pending_callbacks	= 0;
+	info->ctx_cookie	= 0;
+
+	memset(info->last_cookies, 0, sizeof(u64) * UNWIND_MAX_CALLBACKS);
+	memset(info->privs,	   0, sizeof(u64) * UNWIND_MAX_CALLBACKS);
+
+	init_task_work(&info->work, unwind_user_task_work);
+}
+
+void unwind_task_free(struct task_struct *task)
+{
+	struct unwind_task_info *info = &task->unwind_task_info;
+
+	kfree(info->entries);
+}
-- 
2.47.0
Re: [PATCH v3 11/19] unwind: Add deferred user space unwinding API
Posted by Andrii Nakryiko 3 weeks, 5 days ago
On Mon, Oct 28, 2024 at 2:48 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote:
>
> Add unwind_user_deferred() which allows callers to schedule task work to
> unwind the user space stack before returning to user space.  This solves
> several problems for its callers:
>
>   - Ensure the unwind happens in task context even if the caller may
>     running in interrupt context.
>
>   - Only do the unwind once, even if called multiple times either by the
>     same caller or multiple callers.
>
>   - Create a "context context" cookie which allows trace post-processing
>     to correlate kernel unwinds/traces with the user unwind.
>
> Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
> ---
>  include/linux/entry-common.h |   3 +
>  include/linux/sched.h        |   5 +
>  include/linux/unwind_user.h  |  56 ++++++++++
>  kernel/fork.c                |   4 +
>  kernel/unwind/user.c         | 199 +++++++++++++++++++++++++++++++++++
>  5 files changed, 267 insertions(+)
>
> diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
> index 1e50cdb83ae5..efbe8f964f31 100644
> --- a/include/linux/entry-common.h
> +++ b/include/linux/entry-common.h
> @@ -12,6 +12,7 @@
>  #include <linux/resume_user_mode.h>
>  #include <linux/tick.h>
>  #include <linux/kmsan.h>
> +#include <linux/unwind_user.h>
>
>  #include <asm/entry-common.h>
>
> @@ -111,6 +112,8 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs)
>         CT_WARN_ON(__ct_state() != CT_STATE_USER);
>         user_exit_irqoff();
>
> +       unwind_enter_from_user_mode();
> +
>         instrumentation_begin();
>         kmsan_unpoison_entry_regs(regs);
>         trace_hardirqs_off_finish();
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 5007a8e2d640..31b6f1d763ef 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -47,6 +47,7 @@
>  #include <linux/livepatch_sched.h>
>  #include <linux/uidgid_types.h>
>  #include <asm/kmap_size.h>
> +#include <linux/unwind_user.h>
>
>  /* task_struct member predeclarations (sorted alphabetically): */
>  struct audit_context;
> @@ -1592,6 +1593,10 @@ struct task_struct {
>         struct user_event_mm            *user_event_mm;
>  #endif
>
> +#ifdef CONFIG_UNWIND_USER
> +       struct unwind_task_info         unwind_task_info;

this is quite a lot of memory to pay on each task, a lot of which a)
might not have sframe and b) might not need stack unwinding during
their lifetime.

It can be a pointer and allocated in copy_process(), no? Though
ideally this should be initialized lazily, if possible.

> +#endif
> +
>         /*
>          * New fields for task_struct should be added above here, so that
>          * they are included in the randomized portion of task_struct.

[...]

> +static void unwind_user_task_work(struct callback_head *head)
> +{
> +       struct unwind_task_info *info = container_of(head, struct unwind_task_info, work);
> +       struct task_struct *task = container_of(info, struct task_struct, unwind_task_info);
> +       void *privs[UNWIND_MAX_CALLBACKS];
> +       struct unwind_stacktrace trace;
> +       unsigned long pending;
> +       u64 cookie = 0;
> +       int i;
> +
> +       BUILD_BUG_ON(UNWIND_MAX_CALLBACKS > 32);
> +
> +       if (WARN_ON_ONCE(task != current))
> +               return;
> +
> +       if (WARN_ON_ONCE(!info->ctx_cookie || !info->pending_callbacks))
> +               return;
> +
> +       scoped_guard(irqsave) {
> +               pending = info->pending_callbacks;
> +               cookie = info->ctx_cookie;
> +
> +               info->pending_callbacks = 0;
> +               info->ctx_cookie = 0;
> +               memcpy(privs, info->privs, sizeof(void *) * UNWIND_MAX_CALLBACKS);
> +       }
> +
> +       if (!info->entries) {
> +               info->entries = kmalloc(UNWIND_MAX_ENTRIES * sizeof(long),
> +                                       GFP_KERNEL);
> +               if (!info->entries)
> +                       return;

uhm... can we notify callbacks that stack capture failed? otherwise
we'd need some extra timeouts and other complications if we are
*waiting* for this callback to be called

> +       }
> +
> +       trace.entries = info->entries;
> +       trace.nr = 0;
> +       unwind_user(&trace, UNWIND_MAX_ENTRIES);
> +

[...]
Re: [PATCH v3 11/19] unwind: Add deferred user space unwinding API
Posted by Josh Poimboeuf 3 weeks, 5 days ago
On Tue, Oct 29, 2024 at 04:32:46PM -0700, Andrii Nakryiko wrote:
> >  struct audit_context;
> > @@ -1592,6 +1593,10 @@ struct task_struct {
> >         struct user_event_mm            *user_event_mm;
> >  #endif
> >
> > +#ifdef CONFIG_UNWIND_USER
> > +       struct unwind_task_info         unwind_task_info;
> 
> this is quite a lot of memory to pay on each task, a lot of which a)
> might not have sframe

Frame pointers are also supported.

> and b) might not need stack unwinding during their lifetime.

Right, I'm not super happy about that.

> It can be a pointer and allocated in copy_process(), no?
> Though ideally this should be initialized lazily, if possible.

Problem is, the unwinder doesn't know in advance which tasks will be
unwound.

Its first clue is unwind_user_register(), would it make sense for the
caller to clarify whether all tasks need to be unwound or only a
specific subset?

Its second clue is unwind_user_deferred(), which is called for the task
itself.  But by then it's too late because it needs to access the
per-task data from (potentially) irq context so it can't do a lazy
allocation.

I'm definitely open to ideas...

> > +       if (!info->entries) {
> > +               info->entries = kmalloc(UNWIND_MAX_ENTRIES * sizeof(long),
> > +                                       GFP_KERNEL);
> > +               if (!info->entries)
> > +                       return;
> 
> uhm... can we notify callbacks that stack capture failed? otherwise
> we'd need some extra timeouts and other complications if we are
> *waiting* for this callback to be called

Hm, do you actually plan to wait for the callback?

I assume this is BPF, can you give some high-level detail about how it
will use these interfaces?

-- 
Josh
Re: [PATCH v3 11/19] unwind: Add deferred user space unwinding API
Posted by Andrii Nakryiko 3 weeks, 3 days ago
On Tue, Oct 29, 2024 at 11:10 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote:
>
> On Tue, Oct 29, 2024 at 04:32:46PM -0700, Andrii Nakryiko wrote:
> > >  struct audit_context;
> > > @@ -1592,6 +1593,10 @@ struct task_struct {
> > >         struct user_event_mm            *user_event_mm;
> > >  #endif
> > >
> > > +#ifdef CONFIG_UNWIND_USER
> > > +       struct unwind_task_info         unwind_task_info;
> >
> > this is quite a lot of memory to pay on each task, a lot of which a)
> > might not have sframe
>
> Frame pointers are also supported.
>
> > and b) might not need stack unwinding during their lifetime.
>
> Right, I'm not super happy about that.
>
> > It can be a pointer and allocated in copy_process(), no?
> > Though ideally this should be initialized lazily, if possible.
>
> Problem is, the unwinder doesn't know in advance which tasks will be
> unwound.
>
> Its first clue is unwind_user_register(), would it make sense for the
> caller to clarify whether all tasks need to be unwound or only a
> specific subset?
>
> Its second clue is unwind_user_deferred(), which is called for the task
> itself.  But by then it's too late because it needs to access the
> per-task data from (potentially) irq context so it can't do a lazy
> allocation.
>
> I'm definitely open to ideas...

The laziest thing would be to perform GFP_ATOMIC allocation, and if
that fails, oops, too bad, no stack trace for you (but, generally
speaking, no big deal). Advantages are clear, though, right? Single
pointer in task_struct, which most of the time will be NULL, so no
unnecessary overheads.
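
For illustration, a minimal sketch of that lazy scheme, assuming
task_struct grows only a hypothetical "unwind_info" pointer (field name
and flow invented here, not part of the posted patch):

        /* at the top of unwind_user_deferred(): */
        struct unwind_task_info *info = current->unwind_info;

        if (!info) {
                /* first request for this task; may be in irq context */
                info = kzalloc(sizeof(*info), GFP_ATOMIC);
                if (!info)
                        return -ENOMEM;  /* no stack trace this time */
                init_task_work(&info->work, unwind_user_task_work);
                current->unwind_info = info;
        }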

>
> > > +       if (!info->entries) {
> > > +               info->entries = kmalloc(UNWIND_MAX_ENTRIES * sizeof(long),
> > > +                                       GFP_KERNEL);
> > > +               if (!info->entries)
> > > +                       return;
> >
> > uhm... can we notify callbacks that stack capture failed? otherwise
> > we'd need some extra timeouts and other complications if we are
> > *waiting* for this callback to be called
>
> Hm, do you actually plan to wait for the callback?
>
> I assume this is BPF, can you give some high-level detail about how it
> will use these interfaces?

So I have a presentation about integrating async stack trace capture
into BPF at last LSF/MM/BPF, LWN.net has an article ([0]), which you
might want to skim through, not sure if that will be helpful.

But very briefly, the way I see it, we'll have a workflow similar to
the following:

1. BPF program will call some new API to request stack trace capture:
bpf_get_stack_async(...)
  - it will pass a reference to BPF ringbuf into which captured stack
trace should go
  - this API will return unique ID (similar to your cookie, but I'll
need a unique cookie for each bpf_get_stack_async() call, even
if all of them capture the same stack trace; so what Mathieu is
proposing with coalescing all requests and triggering one callback
isn't very convenient in this case, but we can probably work around
that in a BPF-specific way)
2. bpf_get_stack_async() calls unwind_user_deferred() and it expects
one of two outcomes:
  - either successful capture, at which point we'll submit data into
BPF ringbuf with that unique ID for user to correlate whatever data
was captured synchronously and stack trace that arrived later
  - OR we failed to get a stack trace, and we'll still put a small
piece of information for user to know that stack trace is never
coming.

It's the last point that's important to make usability so much
simpler, avoiding unnecessary custom timeouts and stuff like that.
Regardless whether stack trace capture is success or not, user is
guaranteed to get a "notification" about the outcome.

Hope this helps.

But basically, if I called unwind_user_deferred(), I expect to get
some callback, guaranteed, with the result or failure. The only thing
that's not guaranteed (and which makes timeouts bad) is *when* this
will happen. Because stack trace capture can be arbitrarily delayed
and stuff. That's fine, but that also shows why timeout is tricky and
necessarily fragile.


  [0] https://lwn.net/Articles/978736/
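
As a rough sketch of how the flow described above could plug into the
API from this patch: all BPF-side names below are invented, and the
"failure" leg assumes the unwinder ends up notifying callbacks even
when it could not capture a trace (as discussed later in this thread):

        /* registered once at init (real API from this patch): */
        static struct unwind_callback bpf_unwind_callback;
        unwind_user_register(&bpf_unwind_callback, bpf_unwind_cb);

        static void bpf_unwind_cb(struct unwind_stacktrace *trace,
                                  u64 ctx_cookie, void *data)
        {
                struct bpf_stack_req *req = data;       /* invented type */

                if (trace && trace->nr)
                        /* invented helper: trace + req->id into the ringbuf */
                        bpf_stack_req_submit(req, trace);
                else
                        /* invented helper: tell user space no trace is coming */
                        bpf_stack_req_submit_fail(req);
        }

        /* in the hypothetical bpf_get_stack_async(): */
        err = unwind_user_deferred(&bpf_unwind_callback, &req->cookie, req);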

>
> --
> Josh
Re: [PATCH v3 11/19] unwind: Add deferred user space unwinding API
Posted by Josh Poimboeuf 3 weeks, 3 days ago
On Thu, Oct 31, 2024 at 02:22:48PM -0700, Andrii Nakryiko wrote:
> > Problem is, the unwinder doesn't know in advance which tasks will be
> > unwound.
> >
> > Its first clue is unwind_user_register(), would it make sense for the
> > caller to clarify whether all tasks need to be unwound or only a
> > specific subset?
> >
> > Its second clue is unwind_user_deferred(), which is called for the task
> > itself.  But by then it's too late because it needs to access the
> > per-task data from (potentially) irq context so it can't do a lazy
> > allocation.
> >
> > I'm definitely open to ideas...
> 
> The laziest thing would be to perform GFP_ATOMIC allocation, and if
> that fails, oops, too bad, no stack trace for you (but, generally
> speaking, no big deal). Advantages are clear, though, right? Single
> pointer in task_struct, which most of the time will be NULL, so no
> unnecessary overheads.

GFP_ATOMIC is limited, I don't think we want the unwinder to trigger
OOM.

> It's the last point that's important to make usability so much
> simpler, avoiding unnecessary custom timeouts and stuff like that.
> Regardless whether stack trace capture is success or not, user is
> guaranteed to get a "notification" about the outcome.
> 
> Hope this helps.
> 
> But basically, if I I called unwind_user_deferred(), I expect to get
> some callback, guaranteed, with the result or failure. The only thing
> that's not guaranteed (and which makes timeouts bad) is *when* this
> will happen. Because stack trace capture can be arbitrarily delayed
> and stuff. That's fine, but that also shows why timeout is tricky and
> necessarily fragile.

That sounds reasonable.  In the OOM error case I can just pass a small
(stack allocated) one-entry trace with only regs->ip.
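
Roughly (a sketch of that fallback in unwind_user_task_work(); the label
name is invented):

        unsigned long ip = instruction_pointer(task_pt_regs(current));

        if (!info->entries) {
                info->entries = kmalloc(UNWIND_MAX_ENTRIES * sizeof(long),
                                        GFP_KERNEL);
                if (!info->entries) {
                        /* OOM: fall back to a one-entry trace with the user IP */
                        trace.entries = &ip;
                        trace.nr = 1;
                        goto do_callbacks;
                }
        }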

-- 
Josh
Re: [PATCH v3 11/19] unwind: Add deferred user space unwinding API
Posted by Andrii Nakryiko 3 weeks, 3 days ago
On Thu, Oct 31, 2024 at 4:13 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote:
>
> On Thu, Oct 31, 2024 at 02:22:48PM -0700, Andrii Nakryiko wrote:
> > > Problem is, the unwinder doesn't know in advance which tasks will be
> > > unwound.
> > >
> > > Its first clue is unwind_user_register(), would it make sense for the
> > > caller to clarify whether all tasks need to be unwound or only a
> > > specific subset?
> > >
> > > Its second clue is unwind_user_deferred(), which is called for the task
> > > itself.  But by then it's too late because it needs to access the
> > > per-task data from (potentially) irq context so it can't do a lazy
> > > allocation.
> > >
> > > I'm definitely open to ideas...
> >
> > The laziest thing would be to perform GFP_ATOMIC allocation, and if
> > that fails, oops, too bad, no stack trace for you (but, generally
> > speaking, no big deal). Advantages are clear, though, right? Single
> > pointer in task_struct, which most of the time will be NULL, so no
> > unnecessary overheads.
>
> GFP_ATOMIC is limited, I don't think we want the unwinder to trigger
> OOM.
>

So all task_structs on the system using 104 bytes more, *permanently*
and *unconditionally*, is not a concern, but lazy GFP_ATOMIC
allocation when you actually need it is?

> > It's the last point that's important to make usability so much
> > simpler, avoiding unnecessary custom timeouts and stuff like that.
> > Regardless whether stack trace capture is success or not, user is
> > guaranteed to get a "notification" about the outcome.
> >
> > Hope this helps.
> >
> > But basically, if I I called unwind_user_deferred(), I expect to get
> > some callback, guaranteed, with the result or failure. The only thing
> > that's not guaranteed (and which makes timeouts bad) is *when* this
> > will happen. Because stack trace capture can be arbitrarily delayed
> > and stuff. That's fine, but that also shows why timeout is tricky and
> > necessarily fragile.
>
> That sounds reasonable.  In the OOM error case I can just pass a small
> (stack allocated) one-entry trace with only regs->ip.
>

SGTM

> --
> Josh
>
Re: [PATCH v3 11/19] unwind: Add deferred user space unwinding API
Posted by Josh Poimboeuf 3 weeks, 2 days ago
On Thu, Oct 31, 2024 at 04:28:08PM -0700, Andrii Nakryiko wrote:
> So all task_structs on the system using 104 bytes more, *permanently*

Either way it's permanent, we don't know when to free it until the task
struct is freed...

> and *unconditionally*, is not a concern

Of course it's a concern, that's why we're looking for something
better...

> but lazy GFP_ATOMIC allocation when you actually need it is?

We don't want to dip into the GFP_ATOMIC emergency reserves, those are
kept for more important things.

Actually, I think I can just use GFP_NOWAIT here.

-- 
Josh
Re: [PATCH v3 11/19] unwind: Add deferred user space unwinding API
Posted by Andrii Nakryiko 3 weeks, 2 days ago
On Fri, Nov 1, 2024 at 10:41 AM Josh Poimboeuf <jpoimboe@kernel.org> wrote:
>
> On Thu, Oct 31, 2024 at 04:28:08PM -0700, Andrii Nakryiko wrote:
> > So all task_structs on the system using 104 bytes more, *permanently*
>
> Either way it's permanent, we don't know when to free it until the task
> struct is freed...
>

I'm not sure if we are arguing for the sake of arguing at this point
:) Yes, for *those tasks* for which we at least once requested stack
trace, that memory will stay, sure. But there are normally tons of
threads that are almost completely idle and/or use so little CPU, that
they won't ever be caught in the profiler, so their stack trace will
never be requested.

Sure, you can come up with a use case where you'll just go over each
task and ask for stack trace for each of them, but that's not a common
case.

So, sorry, but no, I don't agree that these are equivalent things.
Lazy memory allocation is a must, IMO.

> > and *unconditionally*, is not a concern
>
> Of course it's a concern, that's why we're looking for something
> better...
>
> > but lazy GFP_ATOMIC allocation when you actually need it is?
>
> We don't want to dip into the GFP_ATOMIC emergency reserves, those are
> kept for more important things.
>
> Actually, I think I can just use GFP_NOWAIT here.

Whatever semantics works for being called from NMI (even if it can fail).

>
> --
> Josh
Re: [PATCH v3 11/19] unwind: Add deferred user space unwinding API
Posted by Peter Zijlstra 3 weeks, 5 days ago
On Mon, Oct 28, 2024 at 02:47:38PM -0700, Josh Poimboeuf wrote:

> + * The only exception is when the task has migrated to another CPU, *and* this
> + * is called while the task work is running (or has already run).  Then a new
> + * cookie will be generated and the callback will be called again for the new
> + * cookie.

So that's a bit crap. The user stack won't change for having been
migrated.

So perf can readily use the full u64 cookie value as a sequence number,
since the whole perf record will already have the TID of the task in.
Mixing in this CPU number for no good reason and causing trouble like
this just doesn't make sense to me.

If ftrace needs brain damage like this, can't we push this to the user?

That is, do away with the per-cpu sequence crap, and add a per-task
counter that is incremented for every return-to-userspace.
Re: [PATCH v3 11/19] unwind: Add deferred user space unwinding API
Posted by Josh Poimboeuf 3 weeks, 5 days ago
On Tue, Oct 29, 2024 at 02:56:17PM +0100, Peter Zijlstra wrote:
> On Mon, Oct 28, 2024 at 02:47:38PM -0700, Josh Poimboeuf wrote:
> 
> > + * The only exception is when the task has migrated to another CPU, *and* this
> > + * is called while the task work is running (or has already run).  Then a new
> > + * cookie will be generated and the callback will be called again for the new
> > + * cookie.
> 
> So that's a bit crap. The user stack won't change for having been
> migrated.
> 
> So perf can readily use the full u64 cookie value as a sequence number,
> since the whole perf record will already have the TID of the task in.
> Mixing in this CPU number for no good reason and causing trouble like
> this just doesn't make sense to me.
> 
> If ftrace needs brain damage like this, can't we push this to the user?
> 
> That is, do away with the per-cpu sequence crap, and add a per-task
> counter that is incremented for every return-to-userspace.

That would definitely make things easier for me, though IIRC Steven and
Mathieu had some concerns about TID wrapping over days/months/years.

With that mindset I suppose the per-CPU counter could also wrap, though
that could be mitigated by making the cookie a struct with more bits.
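
(For example, something like the below rather than packing both values
into one u64; just a sketch:)

        struct unwind_ctx_cookie {
                u16     cpu;
                u64     ctx;    /* per-CPU (or per-task) entry counter */
        };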

-- 
Josh
Re: [PATCH v3 11/19] unwind: Add deferred user space unwinding API
Posted by Mathieu Desnoyers 3 weeks, 5 days ago
On 2024-10-29 13:17, Josh Poimboeuf wrote:
> On Tue, Oct 29, 2024 at 02:56:17PM +0100, Peter Zijlstra wrote:
>> On Mon, Oct 28, 2024 at 02:47:38PM -0700, Josh Poimboeuf wrote:
>>
>>> + * The only exception is when the task has migrated to another CPU, *and* this
>>> + * is called while the task work is running (or has already run).  Then a new
>>> + * cookie will be generated and the callback will be called again for the new
>>> + * cookie.
>>
>> So that's a bit crap. The user stack won't change for having been
>> migrated.
>>
>> So perf can readily use the full u64 cookie value as a sequence number,
>> since the whole perf record will already have the TID of the task in.
>> Mixing in this CPU number for no good reason and causing trouble like
>> this just doesn't make sense to me.
>>
>> If ftrace needs brain damage like this, can't we push this to the user?
>>
>> That is, do away with the per-cpu sequence crap, and add a per-task
>> counter that is incremented for every return-to-userspace.
> 
> That would definitely make things easier for me, though IIRC Steven and
> Mathieu had some concerns about TID wrapping over days/months/years.
> 
> With that mindset I suppose the per-CPU counter could also wrap, though
> that could be mitigated by making the cookie a struct with more bits.
> 

AFAIR, the scheme we discussed in Prague was different than the
implementation here.

We discussed having a free-running counter per-cpu, and combining it
with the cpu number as top (or low) bits, to effectively make a 64-bit
value that is unique across the entire system, but without requiring a
global counter with its associated cache line bouncing.

Here is part where the implementation here differs from our discussion:
I recall we discussed keeping a snapshot of the counter value within
the task struct of the thread. So we only snapshot the per-cpu value
on first use after entering the kernel, and after that we use the same
per-cpu value snapshot (from task struct) up until return to userspace.
We clear the task struct cookie snapshot on return to userspace.

This way, even if the thread is migrated during the system call, the
cookie value does not change: it simply depends on the point where it
was first snapshotted (either before or after migration). From that
point until return to userspace, we just use the per-thread snapshot
value.

This should allow us to keep a global cookie semantic (no need to
tie this to tracer-specific knowledge about current TID), without the
migration corner cases discussed in the comment above.
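
In other words, something along these lines (a sketch using the names
from this patch; the exit-path hook is assumed, not in the posted code):

        /* First use after kernel entry: snapshot a fresh per-CPU value. */
        static u64 get_task_ctx_cookie(void)
        {
                struct unwind_task_info *info = &current->unwind_task_info;

                if (!info->ctx_cookie) {
                        u64 cpu = raw_smp_processor_id();
                        u64 ctx = __this_cpu_inc_return(unwind_ctx_ctr);

                        info->ctx_cookie = ctx_to_cookie(cpu, ctx);
                }
                return info->ctx_cookie;
        }

        /* Return to user space, irqs disabled: forget the snapshot. */
        static __always_inline void unwind_exit_to_user_mode(void)
        {
                current->unwind_task_info.ctx_cookie = 0;
        }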

Thanks,

Mathieu

-- 
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
Re: [PATCH v3 11/19] unwind: Add deferred user space unwinding API
Posted by Josh Poimboeuf 3 weeks, 5 days ago
On Tue, Oct 29, 2024 at 01:47:59PM -0400, Mathieu Desnoyers wrote:
> > > If ftrace needs brain damage like this, can't we push this to the user?
> > > 
> > > That is, do away with the per-cpu sequence crap, and add a per-task
> > > counter that is incremented for every return-to-userspace.
> > 
> > That would definitely make things easier for me, though IIRC Steven and
> > Mathieu had some concerns about TID wrapping over days/months/years.
> > 
> > With that mindset I suppose the per-CPU counter could also wrap, though
> > that could be mitigated by making the cookie a struct with more bits.
> > 
> 
> AFAIR, the scheme we discussed in Prague was different than the
> implementation here.

It does differ a bit.  I'll explain why below.

> We discussed having a free-running counter per-cpu, and combining it
> with the cpu number as top (or low) bits, to effectively make a 64-bit
> value that is unique across the entire system, but without requiring a
> global counter with its associated cache line bouncing.
> 
> Here is part where the implementation here differs from our discussion:
> I recall we discussed keeping a snapshot of the counter value within
> the task struct of the thread. So we only snapshot the per-cpu value
> on first use after entering the kernel, and after that we use the same
> per-cpu value snapshot (from task struct) up until return to userspace.
> We clear the task struct cookie snapshot on return to userspace.

Right, so adding some details to this, what I remember specifically
deciding on:

 - In unwind_user_deferred(), if task cookie is 0, it snapshots the
   per-cpu counter, stores the old value in the task cookie, and
   increments the new value (with CPU # in top 48 bits).

 - Future calls to unwind_user_deferred() see the task cookie is set and
   reuse the same cookie.

 - Then the task work (which runs right before exiting the kernel)
   clears the task cookie (with interrupts disabled), does the unwind,
   and calls the callbacks.  It clears the task cookie so that any
   future calls to unwind_user_deferred() (either before exiting the
   kernel or after next entry) are guaranteed to get an unwind.

That's what I had implemented originally.  It had a major performance
issue, particularly for long stacks (bash sometimes had 300+ stack
frames in my testing).

The task work runs with interrupts enabled.  So if another PMU interrupt
and call to unwind_user_deferred() happens after the task work clears
the task cookie but before kernel exit, a new cookie is generated and an
additional task work is scheduled.  For long stacks, performance gets
really bad, dominated by all the extra unnecessary unwinding.

So I changed the design a bit:

  - Increment a per-cpu counter at kernel entry before interrupts are
    enabled.

  - In unwind_user_deferred(), if task cookie is 0, it sets the task
    cookie based on the per-cpu counter, like before.  However if this
    cookie was the last one used by this callback+task, it skips the
    callback altogether.

So it eliminates duplicate unwinds except for the CPU migration case.

If I change the entry code to increment a per-task counter instead of a
per-cpu counter then this problem goes away.  I was just concerned about
the performance impacts of doing that on every entry.

-- 
Josh
Re: [PATCH v3 11/19] unwind: Add deferred user space unwinding API
Posted by Mathieu Desnoyers 3 weeks, 4 days ago
On 2024-10-29 14:34, Josh Poimboeuf wrote:
> On Tue, Oct 29, 2024 at 01:47:59PM -0400, Mathieu Desnoyers wrote:
>>>> If ftrace needs brain damage like this, can't we push this to the user?
>>>>
>>>> That is, do away with the per-cpu sequence crap, and add a per-task
>>>> counter that is incremented for every return-to-userspace.
>>>
>>> That would definitely make things easier for me, though IIRC Steven and
>>> Mathieu had some concerns about TID wrapping over days/months/years.
>>>
>>> With that mindset I suppose the per-CPU counter could also wrap, though
>>> that could be mitigated by making the cookie a struct with more bits.
>>>
>>
>> AFAIR, the scheme we discussed in Prague was different than the
>> implementation here.
> 
> It does differ a bit.  I'll explain why below.
> 
>> We discussed having a free-running counter per-cpu, and combining it
>> with the cpu number as top (or low) bits, to effectively make a 64-bit
>> value that is unique across the entire system, but without requiring a
>> global counter with its associated cache line bouncing.
>>
>> Here is part where the implementation here differs from our discussion:
>> I recall we discussed keeping a snapshot of the counter value within
>> the task struct of the thread. So we only snapshot the per-cpu value
>> on first use after entering the kernel, and after that we use the same
>> per-cpu value snapshot (from task struct) up until return to userspace.
>> We clear the task struct cookie snapshot on return to userspace.
> 
> Right, so adding some details to this, what I remember specifically
> deciding on:
> 
>   - In unwind_user_deferred(), if task cookie is 0, it snapshots the
>     per-cpu counter, stores the old value in the task cookie, and
>     increments the new value (with CPU # in top 48 bits).
> 
>   - Future calls to unwind_user_deferred() see the task cookie is set and
>     reuse the same cookie.
> 
>   - Then the task work (which runs right before exiting the kernel)
>     clears the task cookie (with interrupts disabled), does the unwind,
>     and calls the callbacks.  It clears the task cookie so that any
>     future calls to unwind_user_deferred() (either before exiting the
>     kernel or after next entry) are guaranteed to get an unwind.

This is where I think we should improve the logic.

If an unwind_user_deferred() is called right over/after the unwind,
I don't think we want to issue another stack walk: it would be redundant
with the one already in progress or completed.

What we'd want is to make sure that the cookie returned to that
unwind_user_deferred() is the same as the cookie associated with the
in-progress or completed stack unwind. This way, trace analysis can
look at the surrounding events (before and after) the unwind request
and find the associated call stack.

> 
> That's what I had implemented originally.  It had a major performance
> issue, particularly for long stacks (bash sometimes had 300+ stack
> frames in my testing).

That's probably because long stacks require a lot of work (user pages
accesses) to be done when stack walking, which increases the likelihood
of re-issuing unwind_user_deferred() while the stack walk is being done.

That's basically a lack-of-progress issue: a sufficiently long stack
walk with a sufficiently frequent unwind_user_deferred() trigger could
make the system stall forever trying to service stack walks again
and again. That's bad.

> 
> The task work runs with interrupts enabled.  So if another PMU interrupt
> and call to unwind_user_deferred() happens after the task work clears
> the task cookie but before kernel exit, a new cookie is generated and an
> additional task work is scheduled.  For long stacks, performance gets
> really bad, dominated by all the extra unnecessary unwinding.

What you want here is to move the point where you clear the task
cookie to _after_ completion of stack unwind. There are a few ways
this can be done:

A) Clear the task cookie in the task_work() right after the
    unwind_user_deferred() is completed. Downside: if some long task work
    happens to be done after the stack walk, a new unwind_user_deferred()
    could be issued again and we may end up looping forever taking stack
    unwinds and never actually making forward progress.

B) Clear the task cookie after the exit_to_user_mode_loop is done,
    before returning to user-space, while interrupts are disabled.

C) Clear the task cookie when entering kernel space again.

I think (B) and (C) could be interesting approaches, perhaps with
(B) slightly more interesting because it cleans up after itself
before returning to userspace. Also (B) allows us to return to a
purely lazy scheme where there is nothing to do when entering the
kernel. This is therefore more efficient in terms of cache locality,
because we can expect our per-task state to be cache hot when
returning to userspace, but not necessarily when entering into
the kernel.

> 
> So I changed the design a bit:
> 
>    - Increment a per-cpu counter at kernel entry before interrupts are
>      enabled.
> 
>    - In unwind_user_deferred(), if task cookie is 0, it sets the task
>      cookie based on the per-cpu counter, like before.  However if this
>      cookie was the last one used by this callback+task, it skips the
>      callback altogether.
> 
> So it eliminates duplicate unwinds except for the CPU migration case.

This sounds complicated and fragile. And why would we care about
duplicated unwinds on cpu migration ?

What prevents us from moving the per-task cookie clearing to after
exit_to_user_mode_loop instead ? Then there is no need to do per-cpu
counter increment on every kernel entry and we can go back to a lazy
scheme.

> 
> If I change the entry code to increment a per-task counter instead of a
> per-cpu counter then this problem goes away.  I was just concerned about
> the performance impacts of doing that on every entry.

Moving from per-cpu to per-task makes this cookie task-specific and not
global anymore, I don't think we want this for a stack walking
infrastructure meant to be used by various tracers. Also a global cookie
is more robust and does not depend on guaranteeing that all the
trace data is present to guarantee current thread ID accuracy and
thus that cookies match between deferred unwind request and their
fulfillment.

Thanks,

Mathieu

-- 
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
Re: [PATCH v3 11/19] unwind: Add deferred user space unwinding API
Posted by Josh Poimboeuf 3 weeks, 4 days ago
On Wed, Oct 30, 2024 at 09:44:14AM -0400, Mathieu Desnoyers wrote:
> What you want here is to move the point where you clear the task
> cookie to _after_ completion of stack unwind. There are a few ways
> this can be done:
> 
> A) Clear the task cookie in the task_work() right after the
>    unwind_user_deferred() is completed. Downside: if some long task work
>    happen to be done after the stack walk, a new unwind_user_deferred()
>    could be issued again and we may end up looping forever taking stack
>    unwind and never actually making forward progress.
> 
> B) Clear the task cookie after the exit_to_user_mode_loop is done,
>    before returning to user-space, while interrupts are disabled.

Problem is, if another tracer calls unwind_user_deferred() for the first
time, after the task work but before the task cookie gets cleared, it
will see the cookie is non-zero and will fail to schedule another task
work.  So its callback never gets called.

> > If I change the entry code to increment a per-task counter instead of a
> > per-cpu counter then this problem goes away.  I was just concerned about
> > the performance impacts of doing that on every entry.
> 
> Moving from per-cpu to per-task makes this cookie task-specific and not
> global anymore, I don't think we want this for a stack walking
> infrastructure meant to be used by various tracers. Also a global cookie
> is more robust and does not depend on guaranteeing that all the
> trace data is present to guarantee current thread ID accuracy and
> thus that cookies match between deferred unwind request and their
> fulfillment.

I don't disagree.  What I meant was, on entry (or exit), increment the
task cookie *with* the CPU bits included.

Or as you suggested previously, the "cookie" just be a struct with two
fields: CPU # and per-task entry counter.

-- 
Josh
Re: [PATCH v3 11/19] unwind: Add deferred user space unwinding API
Posted by Josh Poimboeuf 3 weeks, 4 days ago
On Wed, Oct 30, 2024 at 10:47:55AM -0700, Josh Poimboeuf wrote:
> On Wed, Oct 30, 2024 at 09:44:14AM -0400, Mathieu Desnoyers wrote:
> > What you want here is to move the point where you clear the task
> > cookie to _after_ completion of stack unwind. There are a few ways
> > this can be done:
> > 
> > A) Clear the task cookie in the task_work() right after the
> >    unwind_user_deferred() is completed. Downside: if some long task work
> >    happen to be done after the stack walk, a new unwind_user_deferred()
> >    could be issued again and we may end up looping forever taking stack
> >    unwind and never actually making forward progress.
> > 
> > B) Clear the task cookie after the exit_to_user_mode_loop is done,
> >    before returning to user-space, while interrupts are disabled.
> 
> Problem is, if another tracer calls unwind_user_deferred() for the first
> time, after the task work but before the task cookie gets cleared, it
> will see the cookie is non-zero and will fail to schedule another task
> work.  So its callback never gets called.

How about we:

  - clear task cookie on entry or exit from user space

  - use a different variable (rather than clearing of task cookie) to
    communicate whether task work is pending

  - keep track of which callbacks have been called for this cookie

something like so?

int unwind_user_deferred(struct unwind_callback *callback, u64 *ctx_cookie, void *data)
{
	struct unwind_task_info *info = &current->unwind_task_info;
	u64 cookie = info->ctx_cookie;
	int idx = callback->idx;

	if (WARN_ON_ONCE(in_nmi()))
		return -EINVAL;

	if (!current->mm)
		return -EINVAL;

	guard(irqsave)();

	if (cookie && (info->pending_callbacks & (1 << idx))) {
		/* callback already scheduled */
		goto done;
	}

	/*
	 * If this is the first call from any caller since the most recent
	 * entry from user space, initialize the task context cookie.
	 */
	if (!cookie) {
		u64 cpu = raw_smp_processor_id();
		u64 ctx_ctr;

		__this_cpu_inc(unwind_ctx_ctr);
		ctx_ctr = __this_cpu_read(unwind_ctx_ctr);

		cookie = ctx_to_cookie(cpu, ctx_ctr);
		info->ctx_cookie = cookie;

	} else {
		if (cookie == info->last_cookies[idx]) {
			/*
			 * The callback has already been called with this unwind,
			 * nothing to do.
			 */
			goto done;
		}

		if (info->pending_callbacks & (1 << idx)) {
			/* callback already scheduled */
			goto done;
		}
	}

	info->pending_callbacks |= (1 << idx);
	info->privs[idx] = data;
	info->last_cookies[idx] = cookie;

	if (!info->task_pending) {
		info->task_pending = 1; /* cleared by unwind_user_task_work() */
		task_work_add(current, &info->work, TWA_RESUME);
	}

done:
	if (ctx_cookie)
		*ctx_cookie = cookie;
	return 0;
}

-- 
Josh
Re: [PATCH v3 11/19] unwind: Add deferred user space unwinding API
Posted by Josh Poimboeuf 3 weeks, 4 days ago
On Wed, Oct 30, 2024 at 10:47:55AM -0700, Josh Poimboeuf wrote:
> > Moving from per-cpu to per-task makes this cookie task-specific and not
> > global anymore, I don't think we want this for a stack walking
> > infrastructure meant to be used by various tracers. Also a global cookie
> > is more robust and does not depend on guaranteeing that all the
> > trace data is present to guarantee current thread ID accuracy and
> > thus that cookies match between deferred unwind request and their
> > fulfillment.
> 
> I don't disagree.  What I meant was, on entry (or exit), increment the
> task cookie *with* the CPU bits included.
> 
> Or as you suggested previously, the "cookie" just be a struct with two
> fields: CPU # and per-task entry counter.

Never mind, that wouldn't work...  Two tasks could have identical
per-task counters while being scheduled on the same CPU.

-- 
Josh
Re: [PATCH v3 11/19] unwind: Add deferred user space unwinding API
Posted by Peter Zijlstra 3 weeks, 5 days ago
On Tue, Oct 29, 2024 at 01:47:59PM -0400, Mathieu Desnoyers wrote:
> On 2024-10-29 13:17, Josh Poimboeuf wrote:
> > On Tue, Oct 29, 2024 at 02:56:17PM +0100, Peter Zijlstra wrote:
> > > On Mon, Oct 28, 2024 at 02:47:38PM -0700, Josh Poimboeuf wrote:
> > > 
> > > > + * The only exception is when the task has migrated to another CPU, *and* this
> > > > + * is called while the task work is running (or has already run).  Then a new
> > > > + * cookie will be generated and the callback will be called again for the new
> > > > + * cookie.
> > > 
> > > So that's a bit crap. The user stack won't change for having been
> > > migrated.
> > > 
> > > So perf can readily use the full u64 cookie value as a sequence number,
> > > since the whole perf record will already have the TID of the task in.
> > > Mixing in this CPU number for no good reason and causing trouble like
> > > this just doesn't make sense to me.
> > > 
> > > If ftrace needs brain damage like this, can't we push this to the user?
> > > 
> > > That is, do away with the per-cpu sequence crap, and add a per-task
> > > counter that is incremented for every return-to-userspace.
> > 
> > That would definitely make things easier for me, though IIRC Steven and
> > Mathieu had some concerns about TID wrapping over days/months/years.
> > 
> > With that mindset I suppose the per-CPU counter could also wrap, though
> > that could be mitigated by making the cookie a struct with more bits.
> > 
> 
> AFAIR, the scheme we discussed in Prague was different than the
> implementation here.
> 
> We discussed having a free-running counter per-cpu, and combining it
> with the cpu number as top (or low) bits, to effectively make a 64-bit
> value that is unique across the entire system, but without requiring a
> global counter with its associated cache line bouncing.
> 
> Here is part where the implementation here differs from our discussion:
> I recall we discussed keeping a snapshot of the counter value within
> the task struct of the thread. So we only snapshot the per-cpu value
> on first use after entering the kernel, and after that we use the same
> per-cpu value snapshot (from task struct) up until return to userspace.
> We clear the task struct cookie snapshot on return to userspace.
> 
> This way, even if the thread is migrated during the system call, the
> cookie value does not change: it simply depends on the point where it
> was first snapshotted (either before or after migration). From that
> point until return to userspace, we just use the per-thread snapshot
> value.
> 
> This should allow us to keep a global cookie semantic (no need to
> tie this to tracer-specific knowledge about current TID), without the
> migration corner cases discussed in the comment above.

The 48:16 bit split gives you uniqueness for around 78 hours at 1GHz.

But seriously, perf doesn't need this. It really only needs a sequence
number if you care to stitch over a LOST packet (and I can't say I care
about that case much) -- and doing that right doesn't really take much
at all.
Re: [PATCH v3 11/19] unwind: Add deferred user space unwinding API
Posted by Steven Rostedt 3 weeks, 5 days ago
On Tue, 29 Oct 2024 19:20:32 +0100
Peter Zijlstra <peterz@infradead.org> wrote:

> The 48:16 bit split gives you uniqueness for around 78 hours at 1GHz.

Are you saying that there will be one system call per nanosecond? This
number is incremented only when a task enters the kernel from user
space *and* requests a stack trace. If that happens 1000 times a
second, that would still be around 9000 years.

> 
> But seriously, perf doesn't need this. It really only needs a sequence
> number if you care to stitch over a LOST packet (and I can't say I care
> about that case much) -- and doing that right doesn't really take much
> at all.

Perf may not care because it has a unique descriptor per task, right?
Where it can already know what events are associated to a task. But
that's just a unique characteristic of perf. The unwinder should give an
identifier for every user space stack trace that it will produce and
pass that back to the tracer when it requests a stack trace that cannot
yet be performed. This identifier is what we are calling a
context cookie. Then when it wants the stack trace, the unwinder will
give the tracer the stack trace along with the identifier
(context-cookie) that this stack trace was for in the past.

It definitely belongs with the unwinder logic.

-- Steve
Re: [PATCH v3 11/19] unwind: Add deferred user space unwinding API
Posted by Peter Zijlstra 3 weeks, 4 days ago
On Wed, Oct 30, 2024 at 02:17:22AM -0400, Steven Rostedt wrote:
> On Tue, 29 Oct 2024 19:20:32 +0100
> Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > The 48:16 bit split gives you uniqueness for around 78 hours at 1GHz.
> 
> Are you saying that there will be one system call per nanosecond? This
> number is incremented only when a task enters the kernel from user
> spaces *and* requests a stack trace. If that happens 1000 times a
> second, that would still be around 9000 years.

We used to be able to do well over a million syscalls a second. I'm not
exactly sure where we are now, the whole speculation shit-show hurt
things quite badly.

> > 
> > But seriously, perf doesn't need this. It really only needs a sequence
> > number if you care to stitch over a LOST packet (and I can't say I care
> > about that case much) -- and doing that right doesn't really take much
> > at all.
> 
> Perf may not care because it has a unique descriptor per task, right?
> Where it can already know what events are associated to a task. But
> that's just a unique characteristic of perf. The unwinder should give a
> identifier for every user space stack trace that it will produce and
> pass that back to the tracer when it requests a stack trace but it
> cannot yet be performed. This identifier is what we are calling a
> context cookie. Then when it wants the stack trace, the unwinder will
> give the tracer the stack trace along with the identifier
> (context-cookie) that this stack trace was for in the past.

You're designing things inside out again. You should add functionality
by adding layers.

Pass a void * into the 'request-unwind' and have the 'do-unwind'
callback get that same pointer. Then anybody that needs identifiers to
figure out where things came from can stuff something in there.
Re: [PATCH v3 11/19] unwind: Add deferred user space unwinding API
Posted by Steven Rostedt 3 weeks, 4 days ago
On Wed, 30 Oct 2024 15:03:24 +0100
Peter Zijlstra <peterz@infradead.org> wrote:

> You're designing things inside out again. You should add functionality
> by adding layers.
> 
> Pass a void * into the 'request-unwind' and have the 'do-unwind'
> callback get that same pointer. Then anybody that needs identifiers to
> figure out where things came from can stuff something in there.

What the hell should I do with a void pointer? I want a stack trace, I
ask for one, it gives me an identifier for it, then at the end when it
does my callback it gives me the stack trace I asked for with the
identifier that it belongs to.

The identifier should be part of the unwinder code as it has to do with
the stack trace and has nothing to do with tracing.

-- Steve
Re: [PATCH v3 11/19] unwind: Add deferred user space unwinding API
Posted by Josh Poimboeuf 3 weeks, 4 days ago
On Wed, Oct 30, 2024 at 03:58:16PM -0400, Steven Rostedt wrote:
> On Wed, 30 Oct 2024 15:03:24 +0100
> Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > You're designing things inside out again. You should add functionality
> > by adding layers.
> > 
> > Pass a void * into the 'request-unwind' and have the 'do-unwind'
> > callback get that same pointer. Then anybody that needs identifiers to
> > figure out where things came from can stuff something in there.
> 
> What the hell should I do with a void pointer? I want a stack trace, I
> ask for one, it gives me an identifier for it, then at the end when it
> does my callback it gives me the stack trace I asked for with the
> identifier that it belongs to.
>
> The identifier should be part of the unwinder code as it has to do
> with the stack trace and has nothing to do with tracing.

If at least most of the users would find the cookie useful -- which it
sounds like they would, maybe even including perf -- then it makes sense
to integrate that into the unwinder.  It also helps the unwinder
disambiguate which callbacks have been called and whether the stack has
already been unwound.

That said, I'm also implementing the void pointer thing, as AFAICT perf
needs to pass the perf_event pointer to the callback.  There's no reason
we can't have both.

-- 
Josh
Re: [PATCH v3 11/19] unwind: Add deferred user space unwinding API
Posted by Peter Zijlstra 3 weeks, 5 days ago
On Mon, Oct 28, 2024 at 02:47:38PM -0700, Josh Poimboeuf wrote:

> +static void unwind_user_task_work(struct callback_head *head)
> +{
...
> +	guard(rwsem_read)(&callbacks_rwsem);
> +
> +	for_each_set_bit(i, &pending, UNWIND_MAX_CALLBACKS) {
> +		if (callbacks[i])
> +			callbacks[i]->func(&trace, cookie, privs[i]);
> +	}

I'm fairly sure people will come with pitchforks for that read-lock
there. They scale like shit on big systems. Please use SRCU or somesuch.

> +}
> +
> +int unwind_user_register(struct unwind_callback *callback, unwind_callback_t func)
> +{
> +	scoped_guard(rwsem_write, &callbacks_rwsem) {
> +		for (int i = 0; i < UNWIND_MAX_CALLBACKS; i++) {
> +			if (!callbacks[i]) {
> +				callback->func = func;
> +				callback->idx = i;
> +				callbacks[i] = callback;
> +				return 0;
> +			}
> +		}
> +	}
> +
> +	callback->func = NULL;
> +	callback->idx = -1;
> +	return -ENOSPC;
> +}
> +
> +int unwind_user_unregister(struct unwind_callback *callback)
> +{
> +	if (callback->idx < 0)
> +		return -EINVAL;
> +
> +	scoped_guard(rwsem_write, &callbacks_rwsem)
> +		callbacks[callback->idx] = NULL;
> +
> +	callback->func = NULL;
> +	callback->idx = -1;
> +
> +	return 0;
> +}
Re: [PATCH v3 11/19] unwind: Add deferred user space unwinding API
Posted by Josh Poimboeuf 3 weeks, 5 days ago
On Tue, Oct 29, 2024 at 02:49:18PM +0100, Peter Zijlstra wrote:
> On Mon, Oct 28, 2024 at 02:47:38PM -0700, Josh Poimboeuf wrote:
> 
> > +static void unwind_user_task_work(struct callback_head *head)
> > +{
> ...
> > +	guard(rwsem_read)(&callbacks_rwsem);
> > +
> > +	for_each_set_bit(i, &pending, UNWIND_MAX_CALLBACKS) {
> > +		if (callbacks[i])
> > +			callbacks[i]->func(&trace, cookie, privs[i]);
> > +	}
> 
> I'm fairly sure people will come with pitchforks for that read-lock
> there. They scale like shit on big systems. Please use SRCU or somesuch.

I'd expect that unwind_user_{register,unregister}() would only be called
once per tracing component during boot so there wouldn't be any
contention.

But I think I can make SRCU work here.
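
Maybe something like this (sketch only; assumes callbacks[] gets __rcu
annotated):

        DEFINE_STATIC_SRCU(callbacks_srcu);

        /* unwind_user_task_work(): */
        int srcu_idx = srcu_read_lock(&callbacks_srcu);

        for_each_set_bit(i, &pending, UNWIND_MAX_CALLBACKS) {
                struct unwind_callback *callback;

                callback = srcu_dereference(callbacks[i], &callbacks_srcu);
                if (callback)
                        callback->func(&trace, cookie, privs[i]);
        }

        srcu_read_unlock(&callbacks_srcu, srcu_idx);

        /* unwind_user_unregister(): */
        rcu_assign_pointer(callbacks[callback->idx], NULL);
        synchronize_srcu(&callbacks_srcu);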

-- 
Josh
Re: [PATCH v3 11/19] unwind: Add deferred user space unwinding API
Posted by Peter Zijlstra 3 weeks, 5 days ago
On Tue, Oct 29, 2024 at 10:05:26AM -0700, Josh Poimboeuf wrote:
> On Tue, Oct 29, 2024 at 02:49:18PM +0100, Peter Zijlstra wrote:
> > On Mon, Oct 28, 2024 at 02:47:38PM -0700, Josh Poimboeuf wrote:
> > 
> > > +static void unwind_user_task_work(struct callback_head *head)
> > > +{
> > ...
> > > +	guard(rwsem_read)(&callbacks_rwsem);
> > > +
> > > +	for_each_set_bit(i, &pending, UNWIND_MAX_CALLBACKS) {
> > > +		if (callbacks[i])
> > > +			callbacks[i]->func(&trace, cookie, privs[i]);
> > > +	}
> > 
> > I'm fairly sure people will come with pitchforks for that read-lock
> > there. They scale like shit on big systems. Please use SRCU or somesuch.
> 
> I'd expect that unwind_user_{register,unregister}() would only be called
> once per tracing component during boot so there wouldn't be any
> contention.

The read-lock does an atomic op on the lock word, try and do that with
200+ CPUs and things get really sad.

> But I think I can make SRCU work here.

Thanks!
Re: [PATCH v3 11/19] unwind: Add deferred user space unwinding API
Posted by Peter Zijlstra 3 weeks, 5 days ago
On Mon, Oct 28, 2024 at 02:47:38PM -0700, Josh Poimboeuf wrote:

> +/*
> + * The "context cookie" is a unique identifier which allows post-processing to
> + * correlate kernel trace(s) with user unwinds.  It has the CPU id the highest
> + * 16 bits and a per-CPU entry counter in the lower 48 bits.
> + */
> +static u64 ctx_to_cookie(u64 cpu, u64 ctx)
> +{
> +	BUILD_BUG_ON(NR_CPUS > 65535);
> +	return (ctx & ((1UL << 48) - 1)) | cpu;
> +}

Did you mean to: (cpu << 48) ?
Re: [PATCH v3 11/19] unwind: Add deferred user space unwinding API
Posted by Josh Poimboeuf 3 weeks, 5 days ago
On Tue, Oct 29, 2024 at 02:48:07PM +0100, Peter Zijlstra wrote:
> On Mon, Oct 28, 2024 at 02:47:38PM -0700, Josh Poimboeuf wrote:
> 
> > +/*
> > + * The "context cookie" is a unique identifier which allows post-processing to
> > + * correlate kernel trace(s) with user unwinds.  It has the CPU id the highest
> > + * 16 bits and a per-CPU entry counter in the lower 48 bits.
> > + */
> > +static u64 ctx_to_cookie(u64 cpu, u64 ctx)
> > +{
> > +	BUILD_BUG_ON(NR_CPUS > 65535);
> > +	return (ctx & ((1UL << 48) - 1)) | cpu;
> > +}
> 
> Did you mean to: (cpu << 48) ?

Indeed... that was the victim of a late refactoring.
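
So presumably the intended version is:

        static u64 ctx_to_cookie(u64 cpu, u64 ctx)
        {
                BUILD_BUG_ON(NR_CPUS > 65535);
                return (ctx & ((1UL << 48) - 1)) | (cpu << 48);
        }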

-- 
Josh