From: yangshiguang <yangshiguang@xiaomi.com>
set_track_prepare() can incur lock recursion.
The issue is that it is called from hrtimer_start_range_ns()
holding per_cpu(hrtimer_bases)[n].lock, but when
CONFIG_DEBUG_OBJECTS_TIMERS is enabled it may wake up kswapd
in set_track_prepare() and then try to take the same
per_cpu(hrtimer_bases)[n].lock again.

So avoid waking up kswapd. The oops looks something like:
BUG: spinlock recursion on CPU#3, swapper/3/0
lock: 0xffffff8a4bf29c80, .magic: dead4ead, .owner: swapper/3/0, .owner_cpu: 3
Hardware name: Qualcomm Technologies, Inc. Popsicle based on SM8850 (DT)
Call trace:
spin_bug+0x0
_raw_spin_lock_irqsave+0x80
hrtimer_try_to_cancel+0x94
task_contending+0x10c
enqueue_dl_entity+0x2a4
dl_server_start+0x74
enqueue_task_fair+0x568
enqueue_task+0xac
do_activate_task+0x14c
ttwu_do_activate+0xcc
try_to_wake_up+0x6c8
default_wake_function+0x20
autoremove_wake_function+0x1c
__wake_up+0xac
wakeup_kswapd+0x19c
wake_all_kswapds+0x78
__alloc_pages_slowpath+0x1ac
__alloc_pages_noprof+0x298
stack_depot_save_flags+0x6b0
stack_depot_save+0x14
set_track_prepare+0x5c
___slab_alloc+0xccc
__kmalloc_cache_noprof+0x470
__set_page_owner+0x2bc
post_alloc_hook[jt]+0x1b8
prep_new_page+0x28
get_page_from_freelist+0x1edc
__alloc_pages_noprof+0x13c
alloc_slab_page+0x244
allocate_slab+0x7c
___slab_alloc+0x8e8
kmem_cache_alloc_noprof+0x450
debug_objects_fill_pool+0x22c
debug_object_activate+0x40
enqueue_hrtimer[jt]+0xdc
hrtimer_start_range_ns+0x5f8
...
Signed-off-by: yangshiguang <yangshiguang@xiaomi.com>
Fixes: 5cf909c553e9 ("mm/slub: use stackdepot to save stack trace in objects")
---
v1 -> v2:
propagate gfp flags to set_track_prepare()
[1] https://lore.kernel.org/all/20250801065121.876793-1-yangshiguang1011@163.com/
---
mm/slub.c | 21 +++++++++++----------
1 file changed, 11 insertions(+), 10 deletions(-)
diff --git a/mm/slub.c b/mm/slub.c
index 30003763d224..dba905bf1e03 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -962,19 +962,20 @@ static struct track *get_track(struct kmem_cache *s, void *object,
 }
 
 #ifdef CONFIG_STACKDEPOT
-static noinline depot_stack_handle_t set_track_prepare(void)
+static noinline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags)
 {
 	depot_stack_handle_t handle;
 	unsigned long entries[TRACK_ADDRS_COUNT];
 	unsigned int nr_entries;
 
+	gfp_flags &= GFP_NOWAIT;
 	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3);
-	handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT);
+	handle = stack_depot_save(entries, nr_entries, gfp_flags);
 
 	return handle;
 }
 #else
-static inline depot_stack_handle_t set_track_prepare(void)
+static inline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags)
 {
 	return 0;
 }
@@ -996,9 +997,9 @@ static void set_track_update(struct kmem_cache *s, void *object,
 }
 
 static __always_inline void set_track(struct kmem_cache *s, void *object,
-				      enum track_item alloc, unsigned long addr)
+				      enum track_item alloc, unsigned long addr, gfp_t gfp_flags)
 {
-	depot_stack_handle_t handle = set_track_prepare();
+	depot_stack_handle_t handle = set_track_prepare(gfp_flags);
 
 	set_track_update(s, object, alloc, addr, handle);
 }
@@ -1921,9 +1922,9 @@ static inline bool free_debug_processing(struct kmem_cache *s,
 static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {}
 static inline int check_object(struct kmem_cache *s, struct slab *slab,
 			void *object, u8 val) { return 1; }
-static inline depot_stack_handle_t set_track_prepare(void) { return 0; }
+static inline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags) { return 0; }
 static inline void set_track(struct kmem_cache *s, void *object,
-			     enum track_item alloc, unsigned long addr) {}
+			     enum track_item alloc, unsigned long addr, gfp_t gfp_flags) {}
 static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
 			struct slab *slab) {}
 static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
@@ -3878,7 +3879,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 	 * tracking info and return the object.
 	 */
 	if (s->flags & SLAB_STORE_USER)
-		set_track(s, freelist, TRACK_ALLOC, addr);
+		set_track(s, freelist, TRACK_ALLOC, addr, gfpflags);
 
 	return freelist;
 }
@@ -3910,7 +3911,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 		goto new_objects;
 
 	if (s->flags & SLAB_STORE_USER)
-		set_track(s, freelist, TRACK_ALLOC, addr);
+		set_track(s, freelist, TRACK_ALLOC, addr, gfpflags);
 
 	return freelist;
 }
@@ -4422,7 +4423,7 @@ static noinline void free_to_partial_list(
 	depot_stack_handle_t handle = 0;
 
 	if (s->flags & SLAB_STORE_USER)
-		handle = set_track_prepare();
+		handle = set_track_prepare(GFP_NOWAIT);
 
 	spin_lock_irqsave(&n->list_lock, flags);
--
2.43.0
On Thu, Aug 14, 2025 at 07:16:42PM +0800, yangshiguang1011@163.com wrote:
> From: yangshiguang <yangshiguang@xiaomi.com>
>
> set_track_prepare() can incur lock recursion.
> The issue is that it is called from hrtimer_start_range_ns()
> holding per_cpu(hrtimer_bases)[n].lock, but when
> CONFIG_DEBUG_OBJECTS_TIMERS is enabled it may wake up kswapd
> in set_track_prepare() and then try to take the same
> per_cpu(hrtimer_bases)[n].lock again.
>
> So avoid waking up kswapd. The oops looks something like:

Hi yangshiguang,

In the next revision, could you please elaborate the commit message
to reflect how this change avoids waking up kswapd?

[...]

> #ifdef CONFIG_STACKDEPOT
> -static noinline depot_stack_handle_t set_track_prepare(void)
> +static noinline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags)
>  {
>  	depot_stack_handle_t handle;
>  	unsigned long entries[TRACK_ADDRS_COUNT];
>  	unsigned int nr_entries;
>
> +	gfp_flags &= GFP_NOWAIT;

Is there any reason to downgrade it to GFP_NOWAIT when the gfp flag allows
direct reclamation?

[...]

> @@ -4422,7 +4423,7 @@ static noinline void free_to_partial_list(
>  	depot_stack_handle_t handle = 0;
>
>  	if (s->flags & SLAB_STORE_USER)
> -		handle = set_track_prepare();
> +		handle = set_track_prepare(GFP_NOWAIT);

I don't think it is safe to use GFP_NOWAIT during free?

Let's say fill_pool() -> kmem_alloc_batch() fails to allocate an object
and then free_object_list() frees allocated objects,
set_track_prepare(GFP_NOWAIT) may wake up kswapd, and the same deadlock
you reported will occur.

So I think it should be __GFP_NOWARN?

--
Cheers,
Harry / Hyeonggon
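To make the flag arithmetic behind this concern concrete: GFP_NOWAIT itself
contains __GFP_KSWAPD_RECLAIM, the bit that lets __alloc_pages_slowpath()
reach wake_all_kswapds(), while bare __GFP_NOWARN carries no reclaim bits.
A minimal userspace sketch of the relationship (toy bit values, not the
kernel's real definitions from include/linux/gfp_types.h):

  /*
   * Toy flag values for illustration only; the real definitions differ
   * by kernel version. Only the containment relationships matter here.
   */
  #include <stdio.h>

  #define __GFP_KSWAPD_RECLAIM	0x1u	/* allocator may wake kswapd */
  #define __GFP_DIRECT_RECLAIM	0x2u	/* allocator may block on reclaim */
  #define __GFP_NOWARN		0x4u	/* suppress failure warnings */

  /* On recent kernels GFP_NOWAIT is kswapd-reclaim plus nowarn. */
  #define GFP_NOWAIT	(__GFP_KSWAPD_RECLAIM | __GFP_NOWARN)

  static int can_wake_kswapd(unsigned int flags)
  {
  	return !!(flags & __GFP_KSWAPD_RECLAIM);
  }

  int main(void)
  {
  	/* The free path above passes GFP_NOWAIT explicitly, so the
  	 * kswapd-wakeup bit survives and the deadlock remains possible. */
  	printf("GFP_NOWAIT   can wake kswapd: %d\n",
  	       can_wake_kswapd(GFP_NOWAIT));	/* prints 1 */

  	/* Bare __GFP_NOWARN carries no reclaim bits at all. */
  	printf("__GFP_NOWARN can wake kswapd: %d\n",
  	       can_wake_kswapd(__GFP_NOWARN));	/* prints 0 */
  	return 0;
  }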
At 2025-08-16 16:25:25, "Harry Yoo" <harry.yoo@oracle.com> wrote:
>In the next revision, could you please elaborate the commit message
>to reflect how this change avoids waking up kswapd?

Of course. Thanks for the reminder.

[...]

>> +	gfp_flags &= GFP_NOWAIT;
>
>Is there any reason to downgrade it to GFP_NOWAIT when the gfp flag allows
>direct reclamation?

Hi Harry,

The original allocation is GFP_NOWAIT.
So I think it's better not to increase the allocation cost here.

>I don't think it is safe to use GFP_NOWAIT during free?
>
>Let's say fill_pool() -> kmem_alloc_batch() fails to allocate an object
>and then free_object_list() frees allocated objects,
>set_track_prepare(GFP_NOWAIT) may wake up kswapd, and the same deadlock
>you reported will occur.
>
>So I think it should be __GFP_NOWARN?

Yes, this ensures safety.

>--
>Cheers,
>Harry / Hyeonggon
On Sat, Aug 16, 2025 at 06:05:15PM +0800, yangshiguang wrote:
> At 2025-08-16 16:25:25, "Harry Yoo" <harry.yoo@oracle.com> wrote:

[...]

> >Is there any reason to downgrade it to GFP_NOWAIT when the gfp flag allows
> >direct reclamation?
>
> Hi Harry,
>
> The original allocation is GFP_NOWAIT.
> So I think it's better not to increase the allocation cost here.

I don't think the allocation cost is important here, because collecting
a stack trace for each alloc/free is quite slow anyway. And we don't really
care about performance in debug caches (it isn't designed to be
performant).

I think it was GFP_NOWAIT because it was considered safe without
regard to the GFP flags passed, rather than due to performance
considerations.

--
Cheers,
Harry / Hyeonggon
At 2025-08-16 18:46:12, "Harry Yoo" <harry.yoo@oracle.com> wrote:

[...]

>I don't think the allocation cost is important here, because collecting
>a stack trace for each alloc/free is quite slow anyway. And we don't really
>care about performance in debug caches (it isn't designed to be
>performant).
>
>I think it was GFP_NOWAIT because it was considered safe without
>regard to the GFP flags passed, rather than due to performance
>considerations.

Hi Harry,

Is that so?

	gfp_flags &= (GFP_NOWAIT | __GFP_DIRECT_RECLAIM);

>--
>Cheers,
>Harry / Hyeonggon
On Mon, Aug 18, 2025 at 10:07:40AM +0800, yangshiguang wrote:
> At 2025-08-16 18:46:12, "Harry Yoo" <harry.yoo@oracle.com> wrote:

[...]

> Hi Harry,
>
> Is that so?
>
> 	gfp_flags &= (GFP_NOWAIT | __GFP_DIRECT_RECLAIM);

This still clears gfp flags passed by the caller to the allocator.
Why not use gfp_flags directly without clearing some flags?

--
Cheers,
Harry / Hyeonggon
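For concreteness, if the caller's flags were passed straight through as
suggested, set_track_prepare() could drop the masking line entirely. A
sketch of that shape (an illustration of the suggestion under discussion,
not necessarily what a later revision actually did):

  static noinline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags)
  {
  	depot_stack_handle_t handle;
  	unsigned long entries[TRACK_ADDRS_COUNT];
  	unsigned int nr_entries;

  	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3);
  	/* No masking: a caller that must not reclaim simply does not
  	 * have the reclaim bits set in gfp_flags to begin with. */
  	handle = stack_depot_save(entries, nr_entries, gfp_flags);

  	return handle;
  }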
Rather than masking with GFP_NOWAIT, which still allows kswapd to be
woken, let's strip every reclaim bit (__GFP_RECLAIM, i.e. both
__GFP_DIRECT_RECLAIM and __GFP_KSWAPD_RECLAIM) and add
__GFP_NORETRY | __GFP_NOWARN. That guarantees we never enter the slow
path that calls wakeup_kswapd(), so the timer-base lock can't be
re-entered.

On 8/16/2025 12:25 PM, Harry Yoo wrote:
> On Thu, Aug 14, 2025 at 07:16:42PM +0800, yangshiguang1011@163.com wrote:

[...]

>> +	gfp_flags &= GFP_NOWAIT;
>
> Is there any reason to downgrade it to GFP_NOWAIT when the gfp flag allows
> direct reclamation?

[...]

> I don't think it is safe to use GFP_NOWAIT during free?
>
> Let's say fill_pool() -> kmem_alloc_batch() fails to allocate an object
> and then free_object_list() frees allocated objects,
> set_track_prepare(GFP_NOWAIT) may wake up kswapd, and the same deadlock
> you reported will occur.
>
> So I think it should be __GFP_NOWARN?
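Expressed as code, the masking proposed here would look something like the
following (illustrative sketch only, not taken from any posted patch;
__GFP_RECLAIM is defined as __GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM):

  	/* Clear both reclaim bits so the allocation can never reach
  	 * wake_all_kswapds(), then fail fast and quietly. */
  	gfp_flags &= ~__GFP_RECLAIM;
  	gfp_flags |= __GFP_NORETRY | __GFP_NOWARN;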
At 2025-08-16 17:33:00, "Giorgi Tchankvetadze" <giorgitchankvetadze1997@gmail.com> wrote:
>Rather than masking with GFP_NOWAIT, which still allows kswapd to be
>woken, let's strip every reclaim bit (__GFP_RECLAIM, i.e. both
>__GFP_DIRECT_RECLAIM and __GFP_KSWAPD_RECLAIM) and add
>__GFP_NORETRY | __GFP_NOWARN. That guarantees we never enter the slow
>path that calls wakeup_kswapd(), so the timer-base lock can't be
>re-entered.

Hi Giorgi,

Thanks for your advice.
But most allocation contexts do allow kswapd to be woken up, so it
might be better to keep waking kswapd where the caller's flags permit
it, rather than stripping the reclaim bits unconditionally.

[...]