The kmalloc_nolock() implementation has several complications and
restrictions due to SLUB's cpu slab locking, lockless fastpath and
PREEMPT_RT differences. With cpu slab usage removed, we can simplify
things:
- the local_lock_cpu_slab() macros became unused, remove them
- we no longer need to set up lockdep classes on PREEMPT_RT
- we no longer need to annotate ___slab_alloc as NOKPROBE_SYMBOL
since there's no lockless cpu freelist manipulation anymore
- __slab_alloc_node() can be called from kmalloc_nolock_noprof()
unconditionally
Note that we still need __CMPXCHG_DOUBLE: while we no longer use
cmpxchg16b on the cpu freelist, we still use it on the slab freelist,
and the alternative is slab_lock(), which can be interrupted by an NMI.
Clarify the comment to mention that specifically.
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
mm/slab.h | 1 -
mm/slub.c | 100 ++++----------------------------------------------------------
2 files changed, 6 insertions(+), 95 deletions(-)
diff --git a/mm/slab.h b/mm/slab.h
index b2663cc594f3..7dde0b56a7b0 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -208,7 +208,6 @@ struct kmem_cache_order_objects {
*/
struct kmem_cache {
struct kmem_cache_cpu __percpu *cpu_slab;
- struct lock_class_key lock_key;
struct slub_percpu_sheaves __percpu *cpu_sheaves;
/* Used for retrieving partial slabs, etc. */
slab_flags_t flags;
diff --git a/mm/slub.c b/mm/slub.c
index 6f5ca26bbb00..6dd7fd153391 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3679,29 +3679,12 @@ static inline unsigned int init_tid(int cpu)
static void init_kmem_cache_cpus(struct kmem_cache *s)
{
-#ifdef CONFIG_PREEMPT_RT
- /*
- * Register lockdep key for non-boot kmem caches to avoid
- * WARN_ON_ONCE(static_obj(key))) in lockdep_register_key()
- */
- bool finegrain_lockdep = !init_section_contains(s, 1);
-#else
- /*
- * Don't bother with different lockdep classes for each
- * kmem_cache, since we only use local_trylock_irqsave().
- */
- bool finegrain_lockdep = false;
-#endif
int cpu;
struct kmem_cache_cpu *c;
- if (finegrain_lockdep)
- lockdep_register_key(&s->lock_key);
for_each_possible_cpu(cpu) {
c = per_cpu_ptr(s->cpu_slab, cpu);
local_trylock_init(&c->lock);
- if (finegrain_lockdep)
- lockdep_set_class(&c->lock, &s->lock_key);
c->tid = init_tid(cpu);
}
}
@@ -3792,47 +3775,6 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab,
}
}
-/*
- * ___slab_alloc()'s caller is supposed to check if kmem_cache::kmem_cache_cpu::lock
- * can be acquired without a deadlock before invoking the function.
- *
- * Without LOCKDEP we trust the code to be correct. kmalloc_nolock() is
- * using local_lock_is_locked() properly before calling local_lock_cpu_slab(),
- * and kmalloc() is not used in an unsupported context.
- *
- * With LOCKDEP, on PREEMPT_RT lockdep does its checking in local_lock_irqsave().
- * On !PREEMPT_RT we use trylock to avoid false positives in NMI, but
- * lockdep_assert() will catch a bug in case:
- * #1
- * kmalloc() -> ___slab_alloc() -> irqsave -> NMI -> bpf -> kmalloc_nolock()
- * or
- * #2
- * kmalloc() -> ___slab_alloc() -> irqsave -> tracepoint/kprobe -> bpf -> kmalloc_nolock()
- *
- * On PREEMPT_RT an invocation is not possible from IRQ-off or preempt
- * disabled context. The lock will always be acquired and if needed it
- * block and sleep until the lock is available.
- * #1 is possible in !PREEMPT_RT only.
- * #2 is possible in both with a twist that irqsave is replaced with rt_spinlock:
- * kmalloc() -> ___slab_alloc() -> rt_spin_lock(kmem_cache_A) ->
- * tracepoint/kprobe -> bpf -> kmalloc_nolock() -> rt_spin_lock(kmem_cache_B)
- *
- * local_lock_is_locked() prevents the case kmem_cache_A == kmem_cache_B
- */
-#if defined(CONFIG_PREEMPT_RT) || !defined(CONFIG_LOCKDEP)
-#define local_lock_cpu_slab(s, flags) \
- local_lock_irqsave(&(s)->cpu_slab->lock, flags)
-#else
-#define local_lock_cpu_slab(s, flags) \
- do { \
- bool __l = local_trylock_irqsave(&(s)->cpu_slab->lock, flags); \
- lockdep_assert(__l); \
- } while (0)
-#endif
-
-#define local_unlock_cpu_slab(s, flags) \
- local_unlock_irqrestore(&(s)->cpu_slab->lock, flags)
-
static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
{
unsigned long flags;
@@ -4320,19 +4262,6 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
return freelist;
}
-/*
- * We disallow kprobes in ___slab_alloc() to prevent reentrance
- *
- * kmalloc() -> ___slab_alloc() -> local_lock_cpu_slab() protected part of
- * ___slab_alloc() manipulating c->freelist -> kprobe -> bpf ->
- * kmalloc_nolock() or kfree_nolock() -> __update_cpu_freelist_fast()
- * manipulating c->freelist without lock.
- *
- * This does not prevent kprobe in functions called from ___slab_alloc() such as
- * local_lock_irqsave() itself, and that is fine, we only need to protect the
- * c->freelist manipulation in ___slab_alloc() itself.
- */
-NOKPROBE_SYMBOL(___slab_alloc);
static __always_inline void *__slab_alloc_node(struct kmem_cache *s,
gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
@@ -5201,10 +5130,11 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
if (!(s->flags & __CMPXCHG_DOUBLE) && !kmem_cache_debug(s))
/*
* kmalloc_nolock() is not supported on architectures that
- * don't implement cmpxchg16b, but debug caches don't use
- * per-cpu slab and per-cpu partial slabs. They rely on
- * kmem_cache_node->list_lock, so kmalloc_nolock() can
- * attempt to allocate from debug caches by
+ * don't implement cmpxchg16b and thus need slab_lock()
+ * which could be preempted by a nmi.
+ * But debug caches don't use that and only rely on
+ * kmem_cache_node->list_lock, so kmalloc_nolock() can attempt
+ * to allocate from debug caches by
* spin_trylock_irqsave(&n->list_lock, ...)
*/
return NULL;
@@ -5214,27 +5144,13 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
if (ret)
goto success;
- ret = ERR_PTR(-EBUSY);
-
/*
* Do not call slab_alloc_node(), since trylock mode isn't
* compatible with slab_pre_alloc_hook/should_failslab and
* kfence_alloc. Hence call __slab_alloc_node() (at most twice)
* and slab_post_alloc_hook() directly.
- *
- * In !PREEMPT_RT ___slab_alloc() manipulates (freelist,tid) pair
- * in irq saved region. It assumes that the same cpu will not
- * __update_cpu_freelist_fast() into the same (freelist,tid) pair.
- * Therefore use in_nmi() to check whether particular bucket is in
- * irq protected section.
- *
- * If in_nmi() && local_lock_is_locked(s->cpu_slab) then it means that
- * this cpu was interrupted somewhere inside ___slab_alloc() after
- * it did local_lock_irqsave(&s->cpu_slab->lock, flags).
- * In this case fast path with __update_cpu_freelist_fast() is not safe.
*/
- if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock))
- ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size);
+ ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size);
if (PTR_ERR(ret) == -EBUSY) {
if (can_retry) {
@@ -7250,10 +7166,6 @@ void __kmem_cache_release(struct kmem_cache *s)
{
cache_random_seq_destroy(s);
pcs_destroy(s);
-#ifdef CONFIG_PREEMPT_RT
- if (s->cpu_slab)
- lockdep_unregister_key(&s->lock_key);
-#endif
free_percpu(s->cpu_slab);
free_kmem_cache_nodes(s);
}
--
2.51.1
On Thu, Oct 23, 2025 at 03:52:36PM +0200, Vlastimil Babka wrote:
> The kmalloc_nolock() implementation has several complications and
> restrictions due to SLUB's cpu slab locking, lockless fastpath and
> PREEMPT_RT differences. With cpu slab usage removed, we can simplify
> things:
>
> - the local_lock_cpu_slab() macros became unused, remove them
>
> - we no longer need to set up lockdep classes on PREEMPT_RT
>
> - we no longer need to annotate ___slab_alloc as NOKPROBE_SYMBOL
> since there's no lockless cpu freelist manipulation anymore
>
> - __slab_alloc_node() can be called from kmalloc_nolock_noprof()
> unconditionally
>
> Note that we still need __CMPXCHG_DOUBLE: while we no longer use
> cmpxchg16b on the cpu freelist, we still use it on the slab freelist,
> and the alternative is slab_lock(), which can be interrupted by an NMI.
> Clarify the comment to mention that specifically.
>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> ---
> mm/slab.h | 1 -
> mm/slub.c | 100 ++++----------------------------------------------------------
> 2 files changed, 6 insertions(+), 95 deletions(-)
>
> diff --git a/mm/slab.h b/mm/slab.h
> index b2663cc594f3..7dde0b56a7b0 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -208,7 +208,6 @@ struct kmem_cache_order_objects {
> */
> struct kmem_cache {
> struct kmem_cache_cpu __percpu *cpu_slab;
> - struct lock_class_key lock_key;
> struct slub_percpu_sheaves __percpu *cpu_sheaves;
> /* Used for retrieving partial slabs, etc. */
> slab_flags_t flags;
> diff --git a/mm/slub.c b/mm/slub.c
> index 6f5ca26bbb00..6dd7fd153391 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -3679,29 +3679,12 @@ static inline unsigned int init_tid(int cpu)
>
> static void init_kmem_cache_cpus(struct kmem_cache *s)
> {
> -#ifdef CONFIG_PREEMPT_RT
> - /*
> - * Register lockdep key for non-boot kmem caches to avoid
> - * WARN_ON_ONCE(static_obj(key))) in lockdep_register_key()
> - */
> - bool finegrain_lockdep = !init_section_contains(s, 1);
> -#else
> - /*
> - * Don't bother with different lockdep classes for each
> - * kmem_cache, since we only use local_trylock_irqsave().
> - */
> - bool finegrain_lockdep = false;
> -#endif
> int cpu;
> struct kmem_cache_cpu *c;
>
> - if (finegrain_lockdep)
> - lockdep_register_key(&s->lock_key);
> for_each_possible_cpu(cpu) {
> c = per_cpu_ptr(s->cpu_slab, cpu);
> local_trylock_init(&c->lock);
> - if (finegrain_lockdep)
> - lockdep_set_class(&c->lock, &s->lock_key);
> c->tid = init_tid(cpu);
> }
> }
> @@ -3792,47 +3775,6 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab,
> }
> }
>
> -/*
> - * ___slab_alloc()'s caller is supposed to check if kmem_cache::kmem_cache_cpu::lock
> - * can be acquired without a deadlock before invoking the function.
> - *
> - * Without LOCKDEP we trust the code to be correct. kmalloc_nolock() is
> - * using local_lock_is_locked() properly before calling local_lock_cpu_slab(),
> - * and kmalloc() is not used in an unsupported context.
> - *
> - * With LOCKDEP, on PREEMPT_RT lockdep does its checking in local_lock_irqsave().
> - * On !PREEMPT_RT we use trylock to avoid false positives in NMI, but
> - * lockdep_assert() will catch a bug in case:
> - * #1
> - * kmalloc() -> ___slab_alloc() -> irqsave -> NMI -> bpf -> kmalloc_nolock()
> - * or
> - * #2
> - * kmalloc() -> ___slab_alloc() -> irqsave -> tracepoint/kprobe -> bpf -> kmalloc_nolock()
> - *
> - * On PREEMPT_RT an invocation is not possible from IRQ-off or preempt
> - * disabled context. The lock will always be acquired and if needed it
> - * block and sleep until the lock is available.
> - * #1 is possible in !PREEMPT_RT only.
> - * #2 is possible in both with a twist that irqsave is replaced with rt_spinlock:
> - * kmalloc() -> ___slab_alloc() -> rt_spin_lock(kmem_cache_A) ->
> - * tracepoint/kprobe -> bpf -> kmalloc_nolock() -> rt_spin_lock(kmem_cache_B)
> - *
> - * local_lock_is_locked() prevents the case kmem_cache_A == kmem_cache_B
> - */
> -#if defined(CONFIG_PREEMPT_RT) || !defined(CONFIG_LOCKDEP)
> -#define local_lock_cpu_slab(s, flags) \
> - local_lock_irqsave(&(s)->cpu_slab->lock, flags)
> -#else
> -#define local_lock_cpu_slab(s, flags) \
> - do { \
> - bool __l = local_trylock_irqsave(&(s)->cpu_slab->lock, flags); \
> - lockdep_assert(__l); \
> - } while (0)
> -#endif
> -
> -#define local_unlock_cpu_slab(s, flags) \
> - local_unlock_irqrestore(&(s)->cpu_slab->lock, flags)
> -
> static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
> {
> unsigned long flags;
> @@ -4320,19 +4262,6 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>
> return freelist;
> }
> -/*
> - * We disallow kprobes in ___slab_alloc() to prevent reentrance
> - *
> - * kmalloc() -> ___slab_alloc() -> local_lock_cpu_slab() protected part of
> - * ___slab_alloc() manipulating c->freelist -> kprobe -> bpf ->
> - * kmalloc_nolock() or kfree_nolock() -> __update_cpu_freelist_fast()
> - * manipulating c->freelist without lock.
> - *
> - * This does not prevent kprobe in functions called from ___slab_alloc() such as
> - * local_lock_irqsave() itself, and that is fine, we only need to protect the
> - * c->freelist manipulation in ___slab_alloc() itself.
> - */
> -NOKPROBE_SYMBOL(___slab_alloc);
>
> static __always_inline void *__slab_alloc_node(struct kmem_cache *s,
> gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
> @@ -5201,10 +5130,11 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
> if (!(s->flags & __CMPXCHG_DOUBLE) && !kmem_cache_debug(s))
> /*
> * kmalloc_nolock() is not supported on architectures that
> - * don't implement cmpxchg16b, but debug caches don't use
> - * per-cpu slab and per-cpu partial slabs. They rely on
> - * kmem_cache_node->list_lock, so kmalloc_nolock() can
> - * attempt to allocate from debug caches by
> + * don't implement cmpxchg16b and thus need slab_lock()
> + * which could be preempted by a nmi.
> + * But debug caches don't use that and only rely on
> + * kmem_cache_node->list_lock, so kmalloc_nolock() can attempt
> + * to allocate from debug caches by
> * spin_trylock_irqsave(&n->list_lock, ...)
> */
> return NULL;
> @@ -5214,27 +5144,13 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
> if (ret)
> goto success;
>
> - ret = ERR_PTR(-EBUSY);
> -
> /*
> * Do not call slab_alloc_node(), since trylock mode isn't
> * compatible with slab_pre_alloc_hook/should_failslab and
> * kfence_alloc. Hence call __slab_alloc_node() (at most twice)
> * and slab_post_alloc_hook() directly.
> - *
> - * In !PREEMPT_RT ___slab_alloc() manipulates (freelist,tid) pair
> - * in irq saved region. It assumes that the same cpu will not
> - * __update_cpu_freelist_fast() into the same (freelist,tid) pair.
> - * Therefore use in_nmi() to check whether particular bucket is in
> - * irq protected section.
> - *
> - * If in_nmi() && local_lock_is_locked(s->cpu_slab) then it means that
> - * this cpu was interrupted somewhere inside ___slab_alloc() after
> - * it did local_lock_irqsave(&s->cpu_slab->lock, flags).
> - * In this case fast path with __update_cpu_freelist_fast() is not safe.
> */
> - if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock))
> - ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size);
> + ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size);
>
> if (PTR_ERR(ret) == -EBUSY) {
After Patch 10 is applied, the logic that returns `EBUSY` has been
removed along with the `s->cpu_slab` logic. As a result, it appears that
`__slab_alloc_node` will no longer return `EBUSY`.
> if (can_retry) {
> @@ -7250,10 +7166,6 @@ void __kmem_cache_release(struct kmem_cache *s)
> {
> cache_random_seq_destroy(s);
> pcs_destroy(s);
> -#ifdef CONFIG_PREEMPT_RT
> - if (s->cpu_slab)
> - lockdep_unregister_key(&s->lock_key);
> -#endif
> free_percpu(s->cpu_slab);
> free_kmem_cache_nodes(s);
> }
>
> --
> 2.51.1
>
On 12/16/25 03:35, Hao Li wrote:
> On Thu, Oct 23, 2025 at 03:52:36PM +0200, Vlastimil Babka wrote:
>> @@ -5214,27 +5144,13 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
>> if (ret)
>> goto success;
>>
>> - ret = ERR_PTR(-EBUSY);
>> -
>> /*
>> * Do not call slab_alloc_node(), since trylock mode isn't
>> * compatible with slab_pre_alloc_hook/should_failslab and
>> * kfence_alloc. Hence call __slab_alloc_node() (at most twice)
>> * and slab_post_alloc_hook() directly.
>> - *
>> - * In !PREEMPT_RT ___slab_alloc() manipulates (freelist,tid) pair
>> - * in irq saved region. It assumes that the same cpu will not
>> - * __update_cpu_freelist_fast() into the same (freelist,tid) pair.
>> - * Therefore use in_nmi() to check whether particular bucket is in
>> - * irq protected section.
>> - *
>> - * If in_nmi() && local_lock_is_locked(s->cpu_slab) then it means that
>> - * this cpu was interrupted somewhere inside ___slab_alloc() after
>> - * it did local_lock_irqsave(&s->cpu_slab->lock, flags).
>> - * In this case fast path with __update_cpu_freelist_fast() is not safe.
>> */
>> - if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock))
>> - ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size);
>> + ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size);
>>
>> if (PTR_ERR(ret) == -EBUSY) {
>
> After Patch 10 is applied, the logic that returns `EBUSY` has been
> removed along with the `s->cpu_slab` logic. As a result, it appears that
> `__slab_alloc_node` will no longer return `EBUSY`.
True, I missed that, thanks.
Since we can still get failures due to the cpu_sheaves local lock being
held, I think we could just do the single retry with a larger bucket if
ret is NULL. While it may be NULL for other reasons (being genuinely out
of memory and the limited context not allowing reclaim etc.), it wouldn't
hurt, and it's better than introducing EBUSY returns into various paths.
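For illustration, a rough sketch of that idea (untested; the bucket bump
via object_size and the retry label are assumed from the existing
kmalloc_nolock_noprof() code, not taken from this patch):

	ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size);

	/*
	 * __slab_alloc_node() now returns either a valid pointer or NULL,
	 * so key the single larger-bucket retry off NULL instead of
	 * ERR_PTR(-EBUSY). A NULL caused by genuine OOM just costs one
	 * harmless extra attempt.
	 */
	if (!ret && can_retry) {
		/* pick the next kmalloc bucket (assumed from existing retry) */
		size = s->object_size + 1;
		can_retry = false;
		goto retry;
	}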
>> if (can_retry) {
>> @@ -7250,10 +7166,6 @@ void __kmem_cache_release(struct kmem_cache *s)
>> {
>> cache_random_seq_destroy(s);
>> pcs_destroy(s);
>> -#ifdef CONFIG_PREEMPT_RT
>> - if (s->cpu_slab)
>> - lockdep_unregister_key(&s->lock_key);
>> -#endif
>> free_percpu(s->cpu_slab);
>> free_kmem_cache_nodes(s);
>> }
>>
>> --
>> 2.51.1
>>
On Fri, Jan 09, 2026 at 11:11:26AM +0100, Vlastimil Babka wrote:
> On 12/16/25 03:35, Hao Li wrote:
> > On Thu, Oct 23, 2025 at 03:52:36PM +0200, Vlastimil Babka wrote:
> >> @@ -5214,27 +5144,13 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
> >> if (ret)
> >> goto success;
> >>
> >> - ret = ERR_PTR(-EBUSY);
> >> -
> >> /*
> >> * Do not call slab_alloc_node(), since trylock mode isn't
> >> * compatible with slab_pre_alloc_hook/should_failslab and
> >> * kfence_alloc. Hence call __slab_alloc_node() (at most twice)
> >> * and slab_post_alloc_hook() directly.
> >> - *
> >> - * In !PREEMPT_RT ___slab_alloc() manipulates (freelist,tid) pair
> >> - * in irq saved region. It assumes that the same cpu will not
> >> - * __update_cpu_freelist_fast() into the same (freelist,tid) pair.
> >> - * Therefore use in_nmi() to check whether particular bucket is in
> >> - * irq protected section.
> >> - *
> >> - * If in_nmi() && local_lock_is_locked(s->cpu_slab) then it means that
> >> - * this cpu was interrupted somewhere inside ___slab_alloc() after
> >> - * it did local_lock_irqsave(&s->cpu_slab->lock, flags).
> >> - * In this case fast path with __update_cpu_freelist_fast() is not safe.
> >> */
> >> - if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock))
> >> - ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size);
> >> + ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size);
> >>
> >> if (PTR_ERR(ret) == -EBUSY) {
> >
> > After Patch 10 is applied, the logic that returns `EBUSY` has been
> > removed along with the `s->cpu_slab` logic. As a result, it appears that
> > `__slab_alloc_node` will no longer return `EBUSY`.
>
> True, I missed that, thanks.
> Since we can still get failures due to the cpu_sheaves local lock being
> held, I think we could just do the single retry with a larger bucket if
> ret is NULL.
Sounds good - this is a clean approach.
> While it may be NULL for other reasons (being genuinely out of memory and
> the limited context not allowing reclaim etc.), it wouldn't hurt, and it's
> better than introducing EBUSY returns into various paths.
I agree - it seems cleaner for __slab_alloc_node() to return only NULL
or a valid pointer. If it could also return -EBUSY, the return semantics
would be a bit less clear.
--
Thanks,
Hao
>
> >> if (can_retry) {
> >> @@ -7250,10 +7166,6 @@ void __kmem_cache_release(struct kmem_cache *s)
> >> {
> >> cache_random_seq_destroy(s);
> >> pcs_destroy(s);
> >> -#ifdef CONFIG_PREEMPT_RT
> >> - if (s->cpu_slab)
> >> - lockdep_unregister_key(&s->lock_key);
> >> -#endif
> >> free_percpu(s->cpu_slab);
> >> free_kmem_cache_nodes(s);
> >> }
> >>
> >> --
> >> 2.51.1
> >>
>