From: yangshiguang <yangshiguang@xiaomi.com>
set_track_prepare() can incur lock recursion.
The issue is that it is called from hrtimer_start_range_ns()
holding per_cpu(hrtimer_bases)[n].lock, but when
CONFIG_DEBUG_OBJECTS_TIMERS is enabled it may wake up kswapd
in set_track_prepare() and then try to take the same
per_cpu(hrtimer_bases)[n].lock again.

So avoid waking up kswapd. The oops looks something like:
BUG: spinlock recursion on CPU#3, swapper/3/0
lock: 0xffffff8a4bf29c80, .magic: dead4ead, .owner: swapper/3/0, .owner_cpu: 3
Hardware name: Qualcomm Technologies, Inc. Popsicle based on SM8850 (DT)
Call trace:
spin_bug+0x0
_raw_spin_lock_irqsave+0x80
hrtimer_try_to_cancel+0x94
task_contending+0x10c
enqueue_dl_entity+0x2a4
dl_server_start+0x74
enqueue_task_fair+0x568
enqueue_task+0xac
do_activate_task+0x14c
ttwu_do_activate+0xcc
try_to_wake_up+0x6c8
default_wake_function+0x20
autoremove_wake_function+0x1c
__wake_up+0xac
wakeup_kswapd+0x19c
wake_all_kswapds+0x78
__alloc_pages_slowpath+0x1ac
__alloc_pages_noprof+0x298
stack_depot_save_flags+0x6b0
stack_depot_save+0x14
set_track_prepare+0x5c
___slab_alloc+0xccc
__kmalloc_cache_noprof+0x470
__set_page_owner+0x2bc
post_alloc_hook[jt]+0x1b8
prep_new_page+0x28
get_page_from_freelist+0x1edc
__alloc_pages_noprof+0x13c
alloc_slab_page+0x244
allocate_slab+0x7c
___slab_alloc+0x8e8
kmem_cache_alloc_noprof+0x450
debug_objects_fill_pool+0x22c
debug_object_activate+0x40
enqueue_hrtimer[jt]+0xdc
hrtimer_start_range_ns+0x5f8
...
Signed-off-by: yangshiguang <yangshiguang@xiaomi.com>
Fixes: 5cf909c553e9 ("mm/slub: use stackdepot to save stack trace in objects")
---
v1 -> v2:
propagate gfp flags to set_track_prepare()
[1] https://lore.kernel.org/all/20250801065121.876793-1-yangshiguang1011@163.com/
---
mm/slub.c | 21 +++++++++++----------
1 file changed, 11 insertions(+), 10 deletions(-)
diff --git a/mm/slub.c b/mm/slub.c
index 30003763d224..dba905bf1e03 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -962,19 +962,20 @@ static struct track *get_track(struct kmem_cache *s, void *object,
 }
 
 #ifdef CONFIG_STACKDEPOT
-static noinline depot_stack_handle_t set_track_prepare(void)
+static noinline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags)
 {
 	depot_stack_handle_t handle;
 	unsigned long entries[TRACK_ADDRS_COUNT];
 	unsigned int nr_entries;
 
+	gfp_flags &= GFP_NOWAIT;
 	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3);
-	handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT);
+	handle = stack_depot_save(entries, nr_entries, gfp_flags);
 
 	return handle;
 }
 #else
-static inline depot_stack_handle_t set_track_prepare(void)
+static inline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags)
 {
 	return 0;
 }
@@ -996,9 +997,9 @@ static void set_track_update(struct kmem_cache *s, void *object,
 }
 
 static __always_inline void set_track(struct kmem_cache *s, void *object,
-				      enum track_item alloc, unsigned long addr)
+				      enum track_item alloc, unsigned long addr, gfp_t gfp_flags)
 {
-	depot_stack_handle_t handle = set_track_prepare();
+	depot_stack_handle_t handle = set_track_prepare(gfp_flags);
 
 	set_track_update(s, object, alloc, addr, handle);
 }
@@ -1921,9 +1922,9 @@ static inline bool free_debug_processing(struct kmem_cache *s,
 static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {}
 static inline int check_object(struct kmem_cache *s, struct slab *slab,
 			void *object, u8 val) { return 1; }
-static inline depot_stack_handle_t set_track_prepare(void) { return 0; }
+static inline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags) { return 0; }
 static inline void set_track(struct kmem_cache *s, void *object,
-			     enum track_item alloc, unsigned long addr) {}
+			     enum track_item alloc, unsigned long addr, gfp_t gfp_flags) {}
 static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
 			struct slab *slab) {}
 static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
@@ -3878,7 +3879,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 	 * tracking info and return the object.
 	 */
 	if (s->flags & SLAB_STORE_USER)
-		set_track(s, freelist, TRACK_ALLOC, addr);
+		set_track(s, freelist, TRACK_ALLOC, addr, gfpflags);
 
 	return freelist;
 }
@@ -3910,7 +3911,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 		goto new_objects;
 
 	if (s->flags & SLAB_STORE_USER)
-		set_track(s, freelist, TRACK_ALLOC, addr);
+		set_track(s, freelist, TRACK_ALLOC, addr, gfpflags);
 
 	return freelist;
 }
@@ -4422,7 +4423,7 @@ static noinline void free_to_partial_list(
 	depot_stack_handle_t handle = 0;
 
 	if (s->flags & SLAB_STORE_USER)
-		handle = set_track_prepare();
+		handle = set_track_prepare(GFP_NOWAIT);
 
 	spin_lock_irqsave(&n->list_lock, flags);
--
2.43.0
On Thu, Aug 14, 2025 at 07:16:42PM +0800, yangshiguang1011@163.com wrote:
> From: yangshiguang <yangshiguang@xiaomi.com>
>
> set_track_prepare() can incur lock recursion.
> The issue is that it is called from hrtimer_start_range_ns()
> holding per_cpu(hrtimer_bases)[n].lock, but when
> CONFIG_DEBUG_OBJECTS_TIMERS is enabled it may wake up kswapd
> in set_track_prepare() and then try to take the same
> per_cpu(hrtimer_bases)[n].lock again.
>
> So avoid waking up kswapd. The oops looks something like:

Hi yangshiguang,

In the next revision, could you please elaborate the commit message
to reflect how this change avoids waking up kswapd?

[...]

> #ifdef CONFIG_STACKDEPOT
> -static noinline depot_stack_handle_t set_track_prepare(void)
> +static noinline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags)
>  {
>  	depot_stack_handle_t handle;
>  	unsigned long entries[TRACK_ADDRS_COUNT];
>  	unsigned int nr_entries;
>
> +	gfp_flags &= GFP_NOWAIT;

Is there any reason to downgrade it to GFP_NOWAIT when the gfp flag allows
direct reclamation?

[...]

> @@ -4422,7 +4423,7 @@ static noinline void free_to_partial_list(
>  	depot_stack_handle_t handle = 0;
>
>  	if (s->flags & SLAB_STORE_USER)
> -		handle = set_track_prepare();
> +		handle = set_track_prepare(GFP_NOWAIT);

I don't think it is safe to use GFP_NOWAIT during free?

Let's say fill_pool() -> kmem_alloc_batch() fails to allocate an object
and then free_object_list() frees allocated objects,
set_track_prepare(GFP_NOWAIT) may wake up kswapd, and the same deadlock
you reported will occur.

So I think it should be __GFP_NOWARN?

--
Cheers,
Harry / Hyeonggon
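To make the flag arithmetic behind this concern concrete: GFP_NOWAIT itself
contains __GFP_KSWAPD_RECLAIM, the bit that lets __alloc_pages_slowpath()
reach wake_all_kswapds(), while bare __GFP_NOWARN carries no reclaim bits.
A minimal userspace sketch of the relationship (toy bit values, not the
kernel's real definitions from include/linux/gfp_types.h):

  /*
   * Toy flag values for illustration only; the real definitions differ
   * by kernel version. Only the containment relationships matter here.
   */
  #include <stdio.h>

  #define __GFP_KSWAPD_RECLAIM	0x1u	/* allocator may wake kswapd */
  #define __GFP_DIRECT_RECLAIM	0x2u	/* allocator may block on reclaim */
  #define __GFP_NOWARN		0x4u	/* suppress failure warnings */

  /* On recent kernels GFP_NOWAIT is kswapd-reclaim plus nowarn. */
  #define GFP_NOWAIT	(__GFP_KSWAPD_RECLAIM | __GFP_NOWARN)

  static int can_wake_kswapd(unsigned int flags)
  {
  	return !!(flags & __GFP_KSWAPD_RECLAIM);
  }

  int main(void)
  {
  	/* The free path above passes GFP_NOWAIT explicitly, so the
  	 * kswapd-wakeup bit survives and the deadlock remains possible. */
  	printf("GFP_NOWAIT   can wake kswapd: %d\n",
  	       can_wake_kswapd(GFP_NOWAIT));	/* prints 1 */

  	/* Bare __GFP_NOWARN carries no reclaim bits at all. */
  	printf("__GFP_NOWARN can wake kswapd: %d\n",
  	       can_wake_kswapd(__GFP_NOWARN));	/* prints 0 */
  	return 0;
  }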
At 2025-08-16 16:25:25, "Harry Yoo" <harry.yoo@oracle.com> wrote:
>In the next revision, could you please elaborate the commit message
>to reflect how this change avoids waking up kswapd?

Of course. Thanks for the reminder.

[...]

>> +	gfp_flags &= GFP_NOWAIT;
>
>Is there any reason to downgrade it to GFP_NOWAIT when the gfp flag allows
>direct reclamation?

Hi Harry,

The original allocation is GFP_NOWAIT.
So I think it's better not to increase the allocation cost here.

>I don't think it is safe to use GFP_NOWAIT during free?
>
>Let's say fill_pool() -> kmem_alloc_batch() fails to allocate an object
>and then free_object_list() frees allocated objects,
>set_track_prepare(GFP_NOWAIT) may wake up kswapd, and the same deadlock
>you reported will occur.
>
>So I think it should be __GFP_NOWARN?

Yes, this ensures safety.

>--
>Cheers,
>Harry / Hyeonggon
On Sat, Aug 16, 2025 at 06:05:15PM +0800, yangshiguang wrote:
> At 2025-08-16 16:25:25, "Harry Yoo" <harry.yoo@oracle.com> wrote:

[...]

> >Is there any reason to downgrade it to GFP_NOWAIT when the gfp flag allows
> >direct reclamation?
>
> Hi Harry,
>
> The original allocation is GFP_NOWAIT.
> So I think it's better not to increase the allocation cost here.

I don't think the allocation cost is important here, because collecting
a stack trace for each alloc/free is quite slow anyway. And we don't really
care about performance in debug caches (it isn't designed to be
performant).

I think it was GFP_NOWAIT because it was considered safe without
regard to the GFP flags passed, rather than due to performance
considerations.

--
Cheers,
Harry / Hyeonggon
At 2025-08-16 18:46:12, "Harry Yoo" <harry.yoo@oracle.com> wrote:

[...]

>I don't think the allocation cost is important here, because collecting
>a stack trace for each alloc/free is quite slow anyway. And we don't really
>care about performance in debug caches (it isn't designed to be
>performant).
>
>I think it was GFP_NOWAIT because it was considered safe without
>regard to the GFP flags passed, rather than due to performance
>considerations.

Hi Harry,

Is that so?

	gfp_flags &= (GFP_NOWAIT | __GFP_DIRECT_RECLAIM);

>--
>Cheers,
>Harry / Hyeonggon
On Mon, Aug 18, 2025 at 10:07:40AM +0800, yangshiguang wrote:
> At 2025-08-16 18:46:12, "Harry Yoo" <harry.yoo@oracle.com> wrote:

[...]

> Hi Harry,
>
> Is that so?
>
> 	gfp_flags &= (GFP_NOWAIT | __GFP_DIRECT_RECLAIM);

This still clears gfp flags passed by the caller to the allocator.
Why not use gfp_flags directly without clearing some flags?

--
Cheers,
Harry / Hyeonggon
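For concreteness, if the caller's flags were passed straight through as
suggested, set_track_prepare() could drop the masking line entirely. A
sketch of that shape (an illustration of the suggestion under discussion,
not necessarily what a later revision actually did):

  static noinline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags)
  {
  	depot_stack_handle_t handle;
  	unsigned long entries[TRACK_ADDRS_COUNT];
  	unsigned int nr_entries;

  	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3);
  	/* No masking: a caller that must not reclaim simply does not
  	 * have the reclaim bits set in gfp_flags to begin with. */
  	handle = stack_depot_save(entries, nr_entries, gfp_flags);

  	return handle;
  }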
Rather than masking with GFP_NOWAIT, which still allows kswapd to be
woken, let's strip every reclaim bit (__GFP_RECLAIM, i.e. both
__GFP_DIRECT_RECLAIM and __GFP_KSWAPD_RECLAIM) and add
__GFP_NORETRY | __GFP_NOWARN. That guarantees we never enter the slow
path that calls wakeup_kswapd(), so the timer-base lock can't be
re-entered.

On 8/16/2025 12:25 PM, Harry Yoo wrote:
> On Thu, Aug 14, 2025 at 07:16:42PM +0800, yangshiguang1011@163.com wrote:

[...]

>> +	gfp_flags &= GFP_NOWAIT;
>
> Is there any reason to downgrade it to GFP_NOWAIT when the gfp flag allows
> direct reclamation?

[...]

> I don't think it is safe to use GFP_NOWAIT during free?
>
> Let's say fill_pool() -> kmem_alloc_batch() fails to allocate an object
> and then free_object_list() frees allocated objects,
> set_track_prepare(GFP_NOWAIT) may wake up kswapd, and the same deadlock
> you reported will occur.
>
> So I think it should be __GFP_NOWARN?
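Expressed as code, the masking proposed here would look something like the
following (illustrative sketch only, not taken from any posted patch;
__GFP_RECLAIM is defined as __GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM):

  	/* Clear both reclaim bits so the allocation can never reach
  	 * wake_all_kswapds(), then fail fast and quietly. */
  	gfp_flags &= ~__GFP_RECLAIM;
  	gfp_flags |= __GFP_NORETRY | __GFP_NOWARN;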
At 2025-08-16 17:33:00, "Giorgi Tchankvetadze" <giorgitchankvetadze1997@gmail.com> wrote:
>Rather than masking with GFP_NOWAIT, which still allows kswapd to be
>woken, let's strip every reclaim bit (__GFP_RECLAIM, i.e. both
>__GFP_DIRECT_RECLAIM and __GFP_KSWAPD_RECLAIM) and add
>__GFP_NORETRY | __GFP_NOWARN. That guarantees we never enter the slow
>path that calls wakeup_kswapd(), so the timer-base lock can't be
>re-entered.

Hi Giorgi,

Thanks for your advice.
But most allocation contexts do allow kswapd to be woken up, so it
might be better to keep waking kswapd where the caller's flags permit
it, rather than stripping the reclaim bits unconditionally.

[...]