[PATCH v10 05/14] unwind_user/deferred: Add unwind cache

Steven Rostedt posted 14 patches 4 months ago
There is a newer version of this series
[PATCH v10 05/14] unwind_user/deferred: Add unwind cache
Posted by Steven Rostedt 4 months ago
From: Josh Poimboeuf <jpoimboe@kernel.org>

Cache the results of the unwind to ensure the unwind is only performed
once, even when called by multiple tracers.

The cache nr_entries gets cleared every time the task exits the kernel.
When a stacktrace is requested, nr_entries gets set to the number of
entries in the stacktrace. If another stacktrace is requested, if
nr_entries is not zero, then it contains the same stacktrace that would be
retrieved so it is not processed again and the entries is given to the
caller.

Co-developed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/entry-common.h          |  2 ++
 include/linux/unwind_deferred.h       |  8 ++++++++
 include/linux/unwind_deferred_types.h |  7 ++++++-
 kernel/unwind/deferred.c              | 26 ++++++++++++++++++++------
 4 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index f94f3fdf15fc..6e850c9d3f0c 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -12,6 +12,7 @@
 #include <linux/resume_user_mode.h>
 #include <linux/tick.h>
 #include <linux/kmsan.h>
+#include <linux/unwind_deferred.h>
 
 #include <asm/entry-common.h>
 #include <asm/syscall.h>
@@ -362,6 +363,7 @@ static __always_inline void exit_to_user_mode(void)
 	lockdep_hardirqs_on_prepare();
 	instrumentation_end();
 
+	unwind_exit_to_user_mode();
 	user_enter_irqoff();
 	arch_exit_to_user_mode();
 	lockdep_hardirqs_on(CALLER_ADDR0);
diff --git a/include/linux/unwind_deferred.h b/include/linux/unwind_deferred.h
index 5064ebe38c4f..7d6cb2ffd084 100644
--- a/include/linux/unwind_deferred.h
+++ b/include/linux/unwind_deferred.h
@@ -12,6 +12,12 @@ void unwind_task_free(struct task_struct *task);
 
 int unwind_deferred_trace(struct unwind_stacktrace *trace);
 
+static __always_inline void unwind_exit_to_user_mode(void)
+{
+	if (unlikely(current->unwind_info.cache))
+		current->unwind_info.cache->nr_entries = 0;
+}
+
 #else /* !CONFIG_UNWIND_USER */
 
 static inline void unwind_task_init(struct task_struct *task) {}
@@ -19,6 +25,8 @@ static inline void unwind_task_free(struct task_struct *task) {}
 
 static inline int unwind_deferred_trace(struct unwind_stacktrace *trace) { return -ENOSYS; }
 
+static inline void unwind_exit_to_user_mode(void) {}
+
 #endif /* !CONFIG_UNWIND_USER */
 
 #endif /* _LINUX_UNWIND_USER_DEFERRED_H */
diff --git a/include/linux/unwind_deferred_types.h b/include/linux/unwind_deferred_types.h
index aa32db574e43..db5b54b18828 100644
--- a/include/linux/unwind_deferred_types.h
+++ b/include/linux/unwind_deferred_types.h
@@ -2,8 +2,13 @@
 #ifndef _LINUX_UNWIND_USER_DEFERRED_TYPES_H
 #define _LINUX_UNWIND_USER_DEFERRED_TYPES_H
 
+struct unwind_cache {
+	unsigned int		nr_entries;
+	unsigned long		entries[];
+};
+
 struct unwind_task_info {
-	unsigned long		*entries;
+	struct unwind_cache	*cache;
 };
 
 #endif /* _LINUX_UNWIND_USER_DEFERRED_TYPES_H */
diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
index 0bafb95e6336..e3913781c8c6 100644
--- a/kernel/unwind/deferred.c
+++ b/kernel/unwind/deferred.c
@@ -24,6 +24,7 @@
 int unwind_deferred_trace(struct unwind_stacktrace *trace)
 {
 	struct unwind_task_info *info = &current->unwind_info;
+	struct unwind_cache *cache;
 
 	/* Should always be called from faultable context */
 	might_fault();
@@ -31,17 +32,30 @@ int unwind_deferred_trace(struct unwind_stacktrace *trace)
 	if (current->flags & PF_EXITING)
 		return -EINVAL;
 
-	if (!info->entries) {
-		info->entries = kmalloc_array(UNWIND_MAX_ENTRIES, sizeof(long),
-					      GFP_KERNEL);
-		if (!info->entries)
+	if (!info->cache) {
+		info->cache = kzalloc(struct_size(cache, entries, UNWIND_MAX_ENTRIES),
+				      GFP_KERNEL);
+		if (!info->cache)
 			return -ENOMEM;
 	}
 
+	cache = info->cache;
+	trace->entries = cache->entries;
+
+	if (cache->nr_entries) {
+		/*
+		 * The user stack has already been previously unwound in this
+		 * entry context.  Skip the unwind and use the cache.
+		 */
+		trace->nr = cache->nr_entries;
+		return 0;
+	}
+
 	trace->nr = 0;
-	trace->entries = info->entries;
 	unwind_user(trace, UNWIND_MAX_ENTRIES);
 
+	cache->nr_entries = trace->nr;
+
 	return 0;
 }
 
@@ -56,5 +70,5 @@ void unwind_task_free(struct task_struct *task)
 {
 	struct unwind_task_info *info = &task->unwind_info;
 
-	kfree(info->entries);
+	kfree(info->cache);
 }
-- 
2.47.2
Re: [PATCH v10 05/14] unwind_user/deferred: Add unwind cache
Posted by Peter Zijlstra 3 months, 3 weeks ago
> diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
> index f94f3fdf15fc..6e850c9d3f0c 100644
> --- a/include/linux/entry-common.h
> +++ b/include/linux/entry-common.h
> @@ -12,6 +12,7 @@
>  #include <linux/resume_user_mode.h>
>  #include <linux/tick.h>
>  #include <linux/kmsan.h>
> +#include <linux/unwind_deferred.h>
>  
>  #include <asm/entry-common.h>
>  #include <asm/syscall.h>
> @@ -362,6 +363,7 @@ static __always_inline void exit_to_user_mode(void)
>  	lockdep_hardirqs_on_prepare();
>  	instrumentation_end();
>  
> +	unwind_exit_to_user_mode();

So I was expecting this to do the actual unwind, and was about to go
yell this is the wrong place for that.

But this is not that. Perhaps find a better name like:
unwind_clear_cache() or so?

>  	user_enter_irqoff();
>  	arch_exit_to_user_mode();
>  	lockdep_hardirqs_on(CALLER_ADDR0);


> diff --git a/include/linux/unwind_deferred_types.h b/include/linux/unwind_deferred_types.h
> index aa32db574e43..db5b54b18828 100644
> --- a/include/linux/unwind_deferred_types.h
> +++ b/include/linux/unwind_deferred_types.h
> @@ -2,8 +2,13 @@
>  #ifndef _LINUX_UNWIND_USER_DEFERRED_TYPES_H
>  #define _LINUX_UNWIND_USER_DEFERRED_TYPES_H
>  
> +struct unwind_cache {
> +	unsigned int		nr_entries;
> +	unsigned long		entries[];
> +};
> +
>  struct unwind_task_info {
> -	unsigned long		*entries;
> +	struct unwind_cache	*cache;
>  };
>  
>  #endif /* _LINUX_UNWIND_USER_DEFERRED_TYPES_H */
> diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
> index 0bafb95e6336..e3913781c8c6 100644
> --- a/kernel/unwind/deferred.c
> +++ b/kernel/unwind/deferred.c
> @@ -24,6 +24,7 @@
>  int unwind_deferred_trace(struct unwind_stacktrace *trace)
>  {
>  	struct unwind_task_info *info = &current->unwind_info;
> +	struct unwind_cache *cache;
>  
>  	/* Should always be called from faultable context */
>  	might_fault();
> @@ -31,17 +32,30 @@ int unwind_deferred_trace(struct unwind_stacktrace *trace)
>  	if (current->flags & PF_EXITING)
>  		return -EINVAL;
>  
> -	if (!info->entries) {
> -		info->entries = kmalloc_array(UNWIND_MAX_ENTRIES, sizeof(long),
> -					      GFP_KERNEL);
> -		if (!info->entries)
> +	if (!info->cache) {
> +		info->cache = kzalloc(struct_size(cache, entries, UNWIND_MAX_ENTRIES),
> +				      GFP_KERNEL);

And now you're one 'long' larger than a page. Surely that's a crap size
for an allocator?
Re: [PATCH v10 05/14] unwind_user/deferred: Add unwind cache
Posted by Steven Rostedt 3 months, 3 weeks ago
On Wed, 18 Jun 2025 16:13:45 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> > diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
> > index f94f3fdf15fc..6e850c9d3f0c 100644
> > --- a/include/linux/entry-common.h
> > +++ b/include/linux/entry-common.h
> > @@ -12,6 +12,7 @@
> >  #include <linux/resume_user_mode.h>
> >  #include <linux/tick.h>
> >  #include <linux/kmsan.h>
> > +#include <linux/unwind_deferred.h>
> >  
> >  #include <asm/entry-common.h>
> >  #include <asm/syscall.h>
> > @@ -362,6 +363,7 @@ static __always_inline void exit_to_user_mode(void)
> >  	lockdep_hardirqs_on_prepare();
> >  	instrumentation_end();
> >  
> > +	unwind_exit_to_user_mode();  
> 
> So I was expecting this to do the actual unwind, and was about to go
> yell this is the wrong place for that.
> 
> But this is not that. Perhaps find a better name like:
> unwind_clear_cache() or so?

Sure.

How about unwind_reset_info()?

As it's not going to just clear the cache but also reset the trace info
(like the timestamp and such).


> 
> >  	user_enter_irqoff();
> >  	arch_exit_to_user_mode();
> >  	lockdep_hardirqs_on(CALLER_ADDR0);  
> 
> 
> > diff --git a/include/linux/unwind_deferred_types.h b/include/linux/unwind_deferred_types.h
> > index aa32db574e43..db5b54b18828 100644
> > --- a/include/linux/unwind_deferred_types.h
> > +++ b/include/linux/unwind_deferred_types.h
> > @@ -2,8 +2,13 @@
> >  #ifndef _LINUX_UNWIND_USER_DEFERRED_TYPES_H
> >  #define _LINUX_UNWIND_USER_DEFERRED_TYPES_H
> >  
> > +struct unwind_cache {
> > +	unsigned int		nr_entries;
> > +	unsigned long		entries[];
> > +};
> > +
> >  struct unwind_task_info {
> > -	unsigned long		*entries;
> > +	struct unwind_cache	*cache;
> >  };
> >  
> >  #endif /* _LINUX_UNWIND_USER_DEFERRED_TYPES_H */
> > diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
> > index 0bafb95e6336..e3913781c8c6 100644
> > --- a/kernel/unwind/deferred.c
> > +++ b/kernel/unwind/deferred.c
> > @@ -24,6 +24,7 @@
> >  int unwind_deferred_trace(struct unwind_stacktrace *trace)
> >  {
> >  	struct unwind_task_info *info = &current->unwind_info;
> > +	struct unwind_cache *cache;
> >  
> >  	/* Should always be called from faultable context */
> >  	might_fault();
> > @@ -31,17 +32,30 @@ int unwind_deferred_trace(struct unwind_stacktrace *trace)
> >  	if (current->flags & PF_EXITING)
> >  		return -EINVAL;
> >  
> > -	if (!info->entries) {
> > -		info->entries = kmalloc_array(UNWIND_MAX_ENTRIES, sizeof(long),
> > -					      GFP_KERNEL);
> > -		if (!info->entries)
> > +	if (!info->cache) {
> > +		info->cache = kzalloc(struct_size(cache, entries, UNWIND_MAX_ENTRIES),
> > +				      GFP_KERNEL);  
> 
> And now you're one 'long' larger than a page. Surely that's a crap size
> for an allocator?

Bah, Ingo suggested to put the counter in the allocation and I didn't think
about the size going over the page. Good catch!

Since it can make one per task, it may be good to make this into a
kmemcache.

-- Steve
Re: [PATCH v10 05/14] unwind_user/deferred: Add unwind cache
Posted by Peter Zijlstra 3 months, 3 weeks ago
On Wed, Jun 18, 2025 at 11:33:59AM -0400, Steven Rostedt wrote:
> On Wed, 18 Jun 2025 16:13:45 +0200
> Peter Zijlstra <peterz@infradead.org> wrote:

> > > +		info->cache = kzalloc(struct_size(cache, entries, UNWIND_MAX_ENTRIES),
> > > +				      GFP_KERNEL);  
> > 
> > And now you're one 'long' larger than a page. Surely that's a crap size
> > for an allocator?
> 
> Bah, Ingo suggested to put the counter in the allocation and I didn't think
> about the size going over the page. Good catch!
> 
> Since it can make one per task, it may be good to make this into a
> kmemcache.

Well, the trivial solution is to make it 511 and call it a day. Don't
make things complicated if you don't have to.
Re: [PATCH v10 05/14] unwind_user/deferred: Add unwind cache
Posted by Steven Rostedt 3 months, 3 weeks ago
On Thu, 19 Jun 2025 09:56:11 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> Well, the trivial solution is to make it 511 and call it a day. Don't
> make things complicated if you don't have to.

I don't know if this is more complicated, but it should make it fit
nicely in a page:

  /* Make the cache fit in a page */
  #define UNWIND_MAX_ENTRIES                                      \
        ((PAGE_SIZE - sizeof(struct unwind_cache)) / sizeof(long))

-- Steve
Re: [PATCH v10 05/14] unwind_user/deferred: Add unwind cache
Posted by Peter Zijlstra 3 months, 3 weeks ago
On Thu, Jun 19, 2025 at 04:47:14AM -0400, Steven Rostedt wrote:
> On Thu, 19 Jun 2025 09:56:11 +0200
> Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > Well, the trivial solution is to make it 511 and call it a day. Don't
> > make things complicated if you don't have to.
> 
> I don't know if this is more complicated, but it should make it fit
> nicely in a page:
> 
>   /* Make the cache fit in a page */
>   #define UNWIND_MAX_ENTRIES                                      \
>         ((PAGE_SIZE - sizeof(struct unwind_cache)) / sizeof(long))

Right, that's the fancy way of spelling 511 :-) Except on 32bit, where
it now spells 1023 instead.

Did you want that bitness difference?

Also, you ready for some *reaaally* big numbers on Power/ARM with 64K
pages? :-)
Re: [PATCH v10 05/14] unwind_user/deferred: Add unwind cache
Posted by Steven Rostedt 3 months, 3 weeks ago

On June 19, 2025 5:04:36 AM EDT, Peter Zijlstra <peterz@infradead.org> wrote:
>On Thu, Jun 19, 2025 at 04:47:14AM -0400, Steven Rostedt wrote:
>> On Thu, 19 Jun 2025 09:56:11 +0200
>> Peter Zijlstra <peterz@infradead.org> wrote:
>> 
>> > Well, the trivial solution is to make it 511 and call it a day. Don't
>> > make things complicated if you don't have to.
>> 
>> I don't know if this is more complicated, but it should make it fit
>> nicely in a page:
>> 
>>   /* Make the cache fit in a page */
>>   #define UNWIND_MAX_ENTRIES                                      \
>>         ((PAGE_SIZE - sizeof(struct unwind_cache)) / sizeof(long))
>
>Right, that's the fancy way of spelling 511 :-) Except on 32bit, where
>it now spells 1023 instead.
>
>Did you want that bitness difference?
>
>Also, you ready for some *reaaally* big numbers on Power/ARM with 64K
>pages? :-)

Bah, yeah, I need to use a size and not just PAGE_SIZE.

 Thanks,

-- Steve