[PATCH v10 04/14] unwind_user/deferred: Add unwind_deferred_trace()

Steven Rostedt posted 14 patches 4 months ago
There is a newer version of this series
[PATCH v10 04/14] unwind_user/deferred: Add unwind_deferred_trace()
Posted by Steven Rostedt 4 months ago
From: Steven Rostedt <rostedt@goodmis.org>

Add a function that must be called inside a faultable context that will
retrieve a user space stack trace. The function unwind_deferred_trace()
can be called by a tracer when a task is about to enter user space, or has
just come back from user space and has interrupts enabled.

This code is based on work by Josh Poimboeuf's deferred unwinding code:

Link: https://lore.kernel.org/all/6052e8487746603bdb29b65f4033e739092d9925.1737511963.git.jpoimboe@kernel.org/

Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/sched.h                 |  5 +++
 include/linux/unwind_deferred.h       | 24 +++++++++++
 include/linux/unwind_deferred_types.h |  9 ++++
 kernel/fork.c                         |  4 ++
 kernel/unwind/Makefile                |  2 +-
 kernel/unwind/deferred.c              | 60 +++++++++++++++++++++++++++
 6 files changed, 103 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/unwind_deferred.h
 create mode 100644 include/linux/unwind_deferred_types.h
 create mode 100644 kernel/unwind/deferred.c

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4f78a64beb52..59fdf7d9bb1e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -46,6 +46,7 @@
 #include <linux/rv.h>
 #include <linux/uidgid_types.h>
 #include <linux/tracepoint-defs.h>
+#include <linux/unwind_deferred_types.h>
 #include <asm/kmap_size.h>
 
 /* task_struct member predeclarations (sorted alphabetically): */
@@ -1654,6 +1655,10 @@ struct task_struct {
 	struct user_event_mm		*user_event_mm;
 #endif
 
+#ifdef CONFIG_UNWIND_USER
+	struct unwind_task_info		unwind_info;
+#endif
+
 	/* CPU-specific state of this task: */
 	struct thread_struct		thread;
 
diff --git a/include/linux/unwind_deferred.h b/include/linux/unwind_deferred.h
new file mode 100644
index 000000000000..5064ebe38c4f
--- /dev/null
+++ b/include/linux/unwind_deferred.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_UNWIND_USER_DEFERRED_H
+#define _LINUX_UNWIND_USER_DEFERRED_H
+
+#include <linux/unwind_user.h>
+#include <linux/unwind_deferred_types.h>
+
+#ifdef CONFIG_UNWIND_USER
+
+void unwind_task_init(struct task_struct *task);
+void unwind_task_free(struct task_struct *task);
+
+int unwind_deferred_trace(struct unwind_stacktrace *trace);
+
+#else /* !CONFIG_UNWIND_USER */
+
+static inline void unwind_task_init(struct task_struct *task) {}
+static inline void unwind_task_free(struct task_struct *task) {}
+
+static inline int unwind_deferred_trace(struct unwind_stacktrace *trace) { return -ENOSYS; }
+
+#endif /* !CONFIG_UNWIND_USER */
+
+#endif /* _LINUX_UNWIND_USER_DEFERRED_H */
diff --git a/include/linux/unwind_deferred_types.h b/include/linux/unwind_deferred_types.h
new file mode 100644
index 000000000000..aa32db574e43
--- /dev/null
+++ b/include/linux/unwind_deferred_types.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_UNWIND_USER_DEFERRED_TYPES_H
+#define _LINUX_UNWIND_USER_DEFERRED_TYPES_H
+
+struct unwind_task_info {
+	unsigned long		*entries;
+};
+
+#endif /* _LINUX_UNWIND_USER_DEFERRED_TYPES_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 1ee8eb11f38b..3341d50c61f2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -105,6 +105,7 @@
 #include <uapi/linux/pidfd.h>
 #include <linux/pidfs.h>
 #include <linux/tick.h>
+#include <linux/unwind_deferred.h>
 
 #include <asm/pgalloc.h>
 #include <linux/uaccess.h>
@@ -732,6 +733,7 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(refcount_read(&tsk->usage));
 	WARN_ON(tsk == current);
 
+	unwind_task_free(tsk);
 	sched_ext_free(tsk);
 	io_uring_free(tsk);
 	cgroup_free(tsk);
@@ -2135,6 +2137,8 @@ __latent_entropy struct task_struct *copy_process(
 	p->bpf_ctx = NULL;
 #endif
 
+	unwind_task_init(p);
+
 	/* Perform scheduler related setup. Assign this task to a CPU. */
 	retval = sched_fork(clone_flags, p);
 	if (retval)
diff --git a/kernel/unwind/Makefile b/kernel/unwind/Makefile
index 349ce3677526..6752ac96d7e2 100644
--- a/kernel/unwind/Makefile
+++ b/kernel/unwind/Makefile
@@ -1 +1 @@
- obj-$(CONFIG_UNWIND_USER) += user.o
+ obj-$(CONFIG_UNWIND_USER)		+= user.o deferred.o
diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
new file mode 100644
index 000000000000..0bafb95e6336
--- /dev/null
+++ b/kernel/unwind/deferred.c
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Deferred user space unwinding
+ */
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/unwind_deferred.h>
+
+#define UNWIND_MAX_ENTRIES 512
+
+/**
+ * unwind_deferred_trace - Produce a user stacktrace in faultable context
+ * @trace: The descriptor that will store the user stacktrace
+ *
+ * This must be called in a known faultable context (usually when entering
+ * or exiting user space). Depending on the available implementations
+ * the @trace will be loaded with the addresses of the user space stacktrace
+ * if it can be found.
+ *
+ * Return: 0 on success and negative on error
+ *         On success @trace will contain the user space stacktrace
+ */
+int unwind_deferred_trace(struct unwind_stacktrace *trace)
+{
+	struct unwind_task_info *info = &current->unwind_info;
+
+	/* Should always be called from faultable context */
+	might_fault();
+
+	if (current->flags & PF_EXITING)
+		return -EINVAL;
+
+	if (!info->entries) {
+		info->entries = kmalloc_array(UNWIND_MAX_ENTRIES, sizeof(long),
+					      GFP_KERNEL);
+		if (!info->entries)
+			return -ENOMEM;
+	}
+
+	trace->nr = 0;
+	trace->entries = info->entries;
+	unwind_user(trace, UNWIND_MAX_ENTRIES);
+
+	return 0;
+}
+
+void unwind_task_init(struct task_struct *task)
+{
+	struct unwind_task_info *info = &task->unwind_info;
+
+	memset(info, 0, sizeof(*info));
+}
+
+void unwind_task_free(struct task_struct *task)
+{
+	struct unwind_task_info *info = &task->unwind_info;
+
+	kfree(info->entries);
+}
-- 
2.47.2
Re: [PATCH v10 04/14] unwind_user/deferred: Add unwind_deferred_trace()
Posted by Peter Zijlstra 3 months, 3 weeks ago
On Tue, Jun 10, 2025 at 08:54:25PM -0400, Steven Rostedt wrote:

> +#define UNWIND_MAX_ENTRIES 512

The reason this is 512 is so that you end up with a whole page below?

> +int unwind_deferred_trace(struct unwind_stacktrace *trace)
> +{
> +	struct unwind_task_info *info = &current->unwind_info;
> +
> +	/* Should always be called from faultable context */
> +	might_fault();
> +
> +	if (current->flags & PF_EXITING)
> +		return -EINVAL;
> +
> +	if (!info->entries) {
> +		info->entries = kmalloc_array(UNWIND_MAX_ENTRIES, sizeof(long),
> +					      GFP_KERNEL);
Re: [PATCH v10 04/14] unwind_user/deferred: Add unwind_deferred_trace()
Posted by Steven Rostedt 3 months, 3 weeks ago
On Wed, 18 Jun 2025 16:01:11 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> On Tue, Jun 10, 2025 at 08:54:25PM -0400, Steven Rostedt wrote:
> 
> > +#define UNWIND_MAX_ENTRIES 512  
> 
> The reason this is 512 is so that you end up with a whole page below?

Possibly. We could probably even make that configurable. Perhaps just use
sysctl_perf_event_max_contexts_per_stack ?


Josh, any comments about why you picked this number?

-- Steve

> 
> > +int unwind_deferred_trace(struct unwind_stacktrace *trace)
> > +{
> > +	struct unwind_task_info *info = &current->unwind_info;
> > +
> > +	/* Should always be called from faultable context */
> > +	might_fault();
> > +
> > +	if (current->flags & PF_EXITING)
> > +		return -EINVAL;
> > +
> > +	if (!info->entries) {
> > +		info->entries = kmalloc_array(UNWIND_MAX_ENTRIES, sizeof(long),
> > +					      GFP_KERNEL);  
>
Re: [PATCH v10 04/14] unwind_user/deferred: Add unwind_deferred_trace()
Posted by Peter Zijlstra 3 months, 3 weeks ago
On Tue, Jun 10, 2025 at 08:54:25PM -0400, Steven Rostedt wrote:
> +/**
> + * unwind_deferred_trace - Produce a user stacktrace in faultable context
> + * @trace: The descriptor that will store the user stacktrace
> + *
> + * This must be called in a known faultable context (usually when entering
> + * or exiting user space). Depending on the available implementations
> + * the @trace will be loaded with the addresses of the user space stacktrace
> + * if it can be found.

I am confused -- why would we ever want to call this on exiting
user-space, or rather kernel entry?

I thought the whole point was to request a user trace while in-kernel,
and defer that to return-to-user.
Re: [PATCH v10 04/14] unwind_user/deferred: Add unwind_deferred_trace()
Posted by Steven Rostedt 3 months, 3 weeks ago
On Wed, 18 Jun 2025 16:02:47 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> On Tue, Jun 10, 2025 at 08:54:25PM -0400, Steven Rostedt wrote:
> > +/**
> > + * unwind_deferred_trace - Produce a user stacktrace in faultable context
> > + * @trace: The descriptor that will store the user stacktrace
> > + *
> > + * This must be called in a known faultable context (usually when entering
> > + * or exiting user space). Depending on the available implementations
> > + * the @trace will be loaded with the addresses of the user space stacktrace
> > + * if it can be found.  
> 
> I am confused -- why would we ever want to call this on exiting
> user-space, or rather kernel entry?
> 
> I thought the whole point was to request a user trace while in-kernel,
> and defer that to return-to-user.

This code was broken out of the unwind deferred trace to be more stand
alone. Actually, it should be renamed to unwind_faultable_trace() or
something to denote that it must be called from a faultable context.

When Josh made perf use the task_work directly, it used this function to do
the trace as it handled the deferring.

Note, a request from the gcc folks is to add a system call that gives the
user space application a backtrace from its current location. This can be
handy for debugging as it would be similar to how we use dump_stack().

This function would be used for that.

-- Steve
Re: [PATCH v10 04/14] unwind_user/deferred: Add unwind_deferred_trace()
Posted by Peter Zijlstra 3 months, 3 weeks ago
On Wed, Jun 18, 2025 at 11:29:39AM -0400, Steven Rostedt wrote:

> Note, a request from the gcc folks is to add a system call that gives the
> user space application a backtrace from its current location. This can be
> handy for debugging as it would be similar to how we use dump_stack().

That makes very little sense to me; apps can typically unwind themselves
just fine, no? In fact, they can use DWARFs and all that.

Also, how about we don't make thing complicated and not confuse comments
with things like this? Focus on the deferred stuff (that's what these
patches are about) -- and then return-to-user is the one and only place
that makes sense.
Re: [PATCH v10 04/14] unwind_user/deferred: Add unwind_deferred_trace()
Posted by Steven Rostedt 3 months, 3 weeks ago
On Thu, 19 Jun 2025 09:54:17 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> On Wed, Jun 18, 2025 at 11:29:39AM -0400, Steven Rostedt wrote:
> 
> > Note, a request from the gcc folks is to add a system call that gives the
> > user space application a backtrace from its current location. This can be
> > handy for debugging as it would be similar to how we use dump_stack().  
> 
> That makes very little sense to me; apps can typically unwind themselves
> just fine, no? In fact, they can use DWARFs and all that.

Not really. It can in gdb, but doing it from a running app means that
the app needs a full parser, and access to the elf file it's running.

> 
> Also, how about we don't make thing complicated and not confuse comments
> with things like this? Focus on the deferred stuff (that's what these
> patches are about) -- and then return-to-user is the one and only place
> that makes sense.

The change log had:

   Add a function that must be called inside a faultable context that will
   retrieve a user space stack trace. The function unwind_deferred_trace()
   can be called by a tracer when a task is about to enter user space, or has
   just come back from user space and has interrupts enabled.

It doesn't mention the backtrace thing. It only makes a statement that
it needs to be done in a faultable context. I renamed the function to:

   unwind_user_faultable()

And updated the change log to:

    unwind_user/deferred: Add unwind_user_faultable()
    
    Add a new API to retrieve a user space callstack called
    unwind_user_faultable(). The difference between this user space stack
    tracer from the current user space stack tracer is that this must be
    called from faultable context as it may use routines to access user space
    data that needs to be faulted in.
    
    It can be safely called from entering or exiting a system call as the code

The explanation is that it must be called in faultable context. It
doesn't add any more policy that that (like it having to be deferred).

-- Steve
Re: [PATCH v10 04/14] unwind_user/deferred: Add unwind_deferred_trace()
Posted by Peter Zijlstra 3 months, 3 weeks ago
On Tue, Jun 10, 2025 at 08:54:25PM -0400, Steven Rostedt wrote:
> diff --git a/kernel/unwind/Makefile b/kernel/unwind/Makefile
> index 349ce3677526..6752ac96d7e2 100644
> --- a/kernel/unwind/Makefile
> +++ b/kernel/unwind/Makefile
> @@ -1 +1 @@
> - obj-$(CONFIG_UNWIND_USER) += user.o
> + obj-$(CONFIG_UNWIND_USER)		+= user.o deferred.o

We really needed that extra whitespace? :-)
Re: [PATCH v10 04/14] unwind_user/deferred: Add unwind_deferred_trace()
Posted by Steven Rostedt 3 months, 3 weeks ago
On Wed, 18 Jun 2025 15:59:07 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> On Tue, Jun 10, 2025 at 08:54:25PM -0400, Steven Rostedt wrote:
> > diff --git a/kernel/unwind/Makefile b/kernel/unwind/Makefile
> > index 349ce3677526..6752ac96d7e2 100644
> > --- a/kernel/unwind/Makefile
> > +++ b/kernel/unwind/Makefile
> > @@ -1 +1 @@
> > - obj-$(CONFIG_UNWIND_USER) += user.o
> > + obj-$(CONFIG_UNWIND_USER)		+= user.o deferred.o  
> 
> We really needed that extra whitespace? :-)

Oops, I have no idea how that happened. Especially since emacs doesn't do
tabs well.

-- Steve
Re: [PATCH v10 04/14] unwind_user/deferred: Add unwind_deferred_trace()
Posted by Steven Rostedt 3 months, 3 weeks ago
On Wed, 18 Jun 2025 11:20:15 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> > > +++ b/kernel/unwind/Makefile
> > > @@ -1 +1 @@
> > > - obj-$(CONFIG_UNWIND_USER) += user.o
> > > + obj-$(CONFIG_UNWIND_USER)		+= user.o deferred.o    
> > 
> > We really needed that extra whitespace? :-)  
> 
> Oops, I have no idea how that happened. Especially since emacs doesn't do
> tabs well.

I will replace the two tabs with a single tab which will still add space,
as it replaces a single space with a tab. But tabs appear to be more
consistent with other Makefiles.

-- Steve
Re: [PATCH v10 04/14] unwind_user/deferred: Add unwind_deferred_trace()
Posted by Peter Zijlstra 3 months, 3 weeks ago
On Tue, Jun 10, 2025 at 08:54:25PM -0400, Steven Rostedt wrote:
> From: Steven Rostedt <rostedt@goodmis.org>
> 
> Add a function that must be called inside a faultable context that will
> retrieve a user space stack trace. The function unwind_deferred_trace()
> can be called by a tracer when a task is about to enter user space, or has
> just come back from user space and has interrupts enabled.

This is word salad; I really can't make much of it.

Let me go read the patch to see if that makes more sense.