The kmalloc_nolock() implementation has several complications and
restrictions due to SLUB's cpu slab locking, lockless fastpath and
PREEMPT_RT differences. With cpu slab usage removed, we can simplify
things:
- relax the PREEMPT_RT context checks back to what they were before
commit a4ae75d1b6a2 ("slab: fix kmalloc_nolock() context check for
PREEMPT_RT"), and reference the explanatory comment in the page
allocator
- the local_lock_cpu_slab() macros became unused, remove them
- we no longer need to set up lockdep classes on PREEMPT_RT
- we no longer need to annotate ___slab_alloc as NOKPROBE_SYMBOL
since there's no lockless cpu freelist manipulation anymore
- __slab_alloc_node() can be called from kmalloc_nolock_noprof()
unconditionally. It can also no longer return EBUSY. But trylock
failures can still happen, so retry with a larger bucket if the
allocation fails for any reason (see the sketch below).
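For illustration, the allocation attempt in kmalloc_nolock_noprof() then
reduces to roughly the following (simplified sketch of the hunk below,
not verbatim):

    ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size);

    if (!ret && can_retry) {
        /* likely a trylock failure; try the next kmalloc bucket once */
        size = s->object_size + 1;
        can_retry = false;
        goto retry;
    }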
Note that we still need __CMPXCHG_DOUBLE: while we no longer use
cmpxchg16b on the cpu freelist, we still use it on the slab freelist,
and the alternative is slab_lock(), which can be interrupted by an NMI.
Clarify the comment to mention this specifically.
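A hazardous chain on architectures without cmpxchg16b could look
roughly like this (illustrative only, not taken from an actual report):

    kfree() -> __slab_free() -> slab_lock(slab)
      -> NMI -> bpf -> kmalloc_nolock()
        -> slab_lock(same slab) /* spins forever */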
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
mm/slab.h | 1 -
mm/slub.c | 144 +++++++++++++-------------------------------------------------
2 files changed, 29 insertions(+), 116 deletions(-)
diff --git a/mm/slab.h b/mm/slab.h
index 4efec41b6445..e9a0738133ed 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -190,7 +190,6 @@ struct kmem_cache_order_objects {
*/
struct kmem_cache {
struct kmem_cache_cpu __percpu *cpu_slab;
- struct lock_class_key lock_key;
struct slub_percpu_sheaves __percpu *cpu_sheaves;
/* Used for retrieving partial slabs, etc. */
slab_flags_t flags;
diff --git a/mm/slub.c b/mm/slub.c
index 33f218c0e8d6..8746d9d3f3a3 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3694,29 +3694,12 @@ static inline unsigned int init_tid(int cpu)
static void init_kmem_cache_cpus(struct kmem_cache *s)
{
-#ifdef CONFIG_PREEMPT_RT
- /*
- * Register lockdep key for non-boot kmem caches to avoid
- * WARN_ON_ONCE(static_obj(key))) in lockdep_register_key()
- */
- bool finegrain_lockdep = !init_section_contains(s, 1);
-#else
- /*
- * Don't bother with different lockdep classes for each
- * kmem_cache, since we only use local_trylock_irqsave().
- */
- bool finegrain_lockdep = false;
-#endif
int cpu;
struct kmem_cache_cpu *c;
- if (finegrain_lockdep)
- lockdep_register_key(&s->lock_key);
for_each_possible_cpu(cpu) {
c = per_cpu_ptr(s->cpu_slab, cpu);
local_trylock_init(&c->lock);
- if (finegrain_lockdep)
- lockdep_set_class(&c->lock, &s->lock_key);
c->tid = init_tid(cpu);
}
}
@@ -3803,47 +3786,6 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab,
}
}
-/*
- * ___slab_alloc()'s caller is supposed to check if kmem_cache::kmem_cache_cpu::lock
- * can be acquired without a deadlock before invoking the function.
- *
- * Without LOCKDEP we trust the code to be correct. kmalloc_nolock() is
- * using local_lock_is_locked() properly before calling local_lock_cpu_slab(),
- * and kmalloc() is not used in an unsupported context.
- *
- * With LOCKDEP, on PREEMPT_RT lockdep does its checking in local_lock_irqsave().
- * On !PREEMPT_RT we use trylock to avoid false positives in NMI, but
- * lockdep_assert() will catch a bug in case:
- * #1
- * kmalloc() -> ___slab_alloc() -> irqsave -> NMI -> bpf -> kmalloc_nolock()
- * or
- * #2
- * kmalloc() -> ___slab_alloc() -> irqsave -> tracepoint/kprobe -> bpf -> kmalloc_nolock()
- *
- * On PREEMPT_RT an invocation is not possible from IRQ-off or preempt
- * disabled context. The lock will always be acquired and if needed it
- * block and sleep until the lock is available.
- * #1 is possible in !PREEMPT_RT only.
- * #2 is possible in both with a twist that irqsave is replaced with rt_spinlock:
- * kmalloc() -> ___slab_alloc() -> rt_spin_lock(kmem_cache_A) ->
- * tracepoint/kprobe -> bpf -> kmalloc_nolock() -> rt_spin_lock(kmem_cache_B)
- *
- * local_lock_is_locked() prevents the case kmem_cache_A == kmem_cache_B
- */
-#if defined(CONFIG_PREEMPT_RT) || !defined(CONFIG_LOCKDEP)
-#define local_lock_cpu_slab(s, flags) \
- local_lock_irqsave(&(s)->cpu_slab->lock, flags)
-#else
-#define local_lock_cpu_slab(s, flags) \
- do { \
- bool __l = local_trylock_irqsave(&(s)->cpu_slab->lock, flags); \
- lockdep_assert(__l); \
- } while (0)
-#endif
-
-#define local_unlock_cpu_slab(s, flags) \
- local_unlock_irqrestore(&(s)->cpu_slab->lock, flags)
-
static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
{
unsigned long flags;
@@ -4402,20 +4344,6 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
return freelist;
}
-/*
- * We disallow kprobes in ___slab_alloc() to prevent reentrance
- *
- * kmalloc() -> ___slab_alloc() -> local_lock_cpu_slab() protected part of
- * ___slab_alloc() manipulating c->freelist -> kprobe -> bpf ->
- * kmalloc_nolock() or kfree_nolock() -> __update_cpu_freelist_fast()
- * manipulating c->freelist without lock.
- *
- * This does not prevent kprobe in functions called from ___slab_alloc() such as
- * local_lock_irqsave() itself, and that is fine, we only need to protect the
- * c->freelist manipulation in ___slab_alloc() itself.
- */
-NOKPROBE_SYMBOL(___slab_alloc);
-
static __always_inline void *__slab_alloc_node(struct kmem_cache *s,
gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
{
@@ -5253,13 +5181,13 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
if (unlikely(!size))
return ZERO_SIZE_PTR;
- if (IS_ENABLED(CONFIG_PREEMPT_RT) && !preemptible())
- /*
- * kmalloc_nolock() in PREEMPT_RT is not supported from
- * non-preemptible context because local_lock becomes a
- * sleeping lock on RT.
- */
+ /*
+ * See the comment for the same check in
+ * alloc_frozen_pages_nolock_noprof()
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq()))
return NULL;
+
retry:
if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
return NULL;
@@ -5268,10 +5196,11 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
if (!(s->flags & __CMPXCHG_DOUBLE) && !kmem_cache_debug(s))
/*
* kmalloc_nolock() is not supported on architectures that
- * don't implement cmpxchg16b, but debug caches don't use
- * per-cpu slab and per-cpu partial slabs. They rely on
- * kmem_cache_node->list_lock, so kmalloc_nolock() can
- * attempt to allocate from debug caches by
+ * don't implement cmpxchg16b and thus need slab_lock()
+ * which could be preempted by a nmi.
+ * But debug caches don't use that and only rely on
+ * kmem_cache_node->list_lock, so kmalloc_nolock() can attempt
+ * to allocate from debug caches by
* spin_trylock_irqsave(&n->list_lock, ...)
*/
return NULL;
@@ -5280,42 +5209,31 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
if (ret)
goto success;
- ret = ERR_PTR(-EBUSY);
-
/*
* Do not call slab_alloc_node(), since trylock mode isn't
* compatible with slab_pre_alloc_hook/should_failslab and
* kfence_alloc. Hence call __slab_alloc_node() (at most twice)
* and slab_post_alloc_hook() directly.
- *
- * In !PREEMPT_RT ___slab_alloc() manipulates (freelist,tid) pair
- * in irq saved region. It assumes that the same cpu will not
- * __update_cpu_freelist_fast() into the same (freelist,tid) pair.
- * Therefore use in_nmi() to check whether particular bucket is in
- * irq protected section.
- *
- * If in_nmi() && local_lock_is_locked(s->cpu_slab) then it means that
- * this cpu was interrupted somewhere inside ___slab_alloc() after
- * it did local_lock_irqsave(&s->cpu_slab->lock, flags).
- * In this case fast path with __update_cpu_freelist_fast() is not safe.
*/
- if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock))
- ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size);
+ ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size);
- if (PTR_ERR(ret) == -EBUSY) {
- if (can_retry) {
- /* pick the next kmalloc bucket */
- size = s->object_size + 1;
- /*
- * Another alternative is to
- * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT;
- * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT;
- * to retry from bucket of the same size.
- */
- can_retry = false;
- goto retry;
- }
- ret = NULL;
+ /*
+ * It's possible we failed due to trylock as we preempted someone with
+ * the sheaves locked, and the list_lock is also held by another cpu.
+ * But it should be rare that multiple kmalloc buckets would have
+ * sheaves locked, so try a larger one.
+ */
+ if (!ret && can_retry) {
+ /* pick the next kmalloc bucket */
+ size = s->object_size + 1;
+ /*
+ * Another alternative is to
+ * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT;
+ * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT;
+ * to retry from bucket of the same size.
+ */
+ can_retry = false;
+ goto retry;
}
success:
@@ -7334,10 +7252,6 @@ void __kmem_cache_release(struct kmem_cache *s)
cache_random_seq_destroy(s);
if (s->cpu_sheaves)
pcs_destroy(s);
-#ifdef CONFIG_PREEMPT_RT
- if (s->cpu_slab)
- lockdep_unregister_key(&s->lock_key);
-#endif
free_percpu(s->cpu_slab);
free_kmem_cache_nodes(s);
}
--
2.52.0
On Fri, Jan 16, 2026 at 03:40:34PM +0100, Vlastimil Babka wrote:
> The kmalloc_nolock() implementation has several complications and
> restrictions due to SLUB's cpu slab locking, lockless fastpath and
> PREEMPT_RT differences. With cpu slab usage removed, we can simplify
> things:
>
> - relax the PREEMPT_RT context checks as they were before commit
> a4ae75d1b6a2 ("slab: fix kmalloc_nolock() context check for
> PREEMPT_RT") and also reference the explanation comment in the page
> allocator
>
> - the local_lock_cpu_slab() macros became unused, remove them
>
> - we no longer need to set up lockdep classes on PREEMPT_RT
>
> - we no longer need to annotate ___slab_alloc as NOKPROBE_SYMBOL
> since there's no lockless cpu freelist manipulation anymore
>
> - __slab_alloc_node() can be called from kmalloc_nolock_noprof()
> unconditionally. It can also no longer return EBUSY. But trylock
> failures can still happen so retry with the larger bucket if the
> allocation fails for any reason.
>
> Note that we still need __CMPXCHG_DOUBLE, because while it was removed
> we don't use cmpxchg16b on cpu freelist anymore, we still use it on
> slab freelist, and the alternative is slab_lock() which can be
> interrupted by a nmi. Clarify the comment to mention it specifically.
>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> ---
What a nice cleanup!
Looks good to me,
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
with a nit below.
> mm/slab.h | 1 -
> mm/slub.c | 144 +++++++++++++-------------------------------------------------
> 2 files changed, 29 insertions(+), 116 deletions(-)
>
> diff --git a/mm/slab.h b/mm/slab.h
> index 4efec41b6445..e9a0738133ed 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -5268,10 +5196,11 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
> if (!(s->flags & __CMPXCHG_DOUBLE) && !kmem_cache_debug(s))
> /*
> * kmalloc_nolock() is not supported on architectures that
> - * don't implement cmpxchg16b, but debug caches don't use
> - * per-cpu slab and per-cpu partial slabs. They rely on
> - * kmem_cache_node->list_lock, so kmalloc_nolock() can
> - * attempt to allocate from debug caches by
> + * don't implement cmpxchg16b and thus need slab_lock()
> + * which could be preempted by a nmi.
nit: I think now this limitation can be removed because the only slab
lock used in the allocation path is get_partial_node() ->
__slab_update_freelist(), but it is always used under n->list_lock.
Being preempted by a NMI while holding the slab lock is fine because
NMI context should fail to acquire n->list_lock and bail out.
But no hurry on this, it's probably not important enough to delay
this series :)
> + * But debug caches don't use that and only rely on
> + * kmem_cache_node->list_lock, so kmalloc_nolock() can attempt
> + * to allocate from debug caches by
> * spin_trylock_irqsave(&n->list_lock, ...)
> */
> return NULL;
>
--
Cheers,
Harry / Hyeonggon
On 1/22/26 02:53, Harry Yoo wrote:
> On Fri, Jan 16, 2026 at 03:40:34PM +0100, Vlastimil Babka wrote:
>> The kmalloc_nolock() implementation has several complications and
>> restrictions due to SLUB's cpu slab locking, lockless fastpath and
>> PREEMPT_RT differences. With cpu slab usage removed, we can simplify
>> things:
>>
>> - relax the PREEMPT_RT context checks as they were before commit
>> a4ae75d1b6a2 ("slab: fix kmalloc_nolock() context check for
>> PREEMPT_RT") and also reference the explanation comment in the page
>> allocator
>>
>> - the local_lock_cpu_slab() macros became unused, remove them
>>
>> - we no longer need to set up lockdep classes on PREEMPT_RT
>>
>> - we no longer need to annotate ___slab_alloc as NOKPROBE_SYMBOL
>> since there's no lockless cpu freelist manipulation anymore
>>
>> - __slab_alloc_node() can be called from kmalloc_nolock_noprof()
>> unconditionally. It can also no longer return EBUSY. But trylock
>> failures can still happen so retry with the larger bucket if the
>> allocation fails for any reason.
>>
>> Note that we still need __CMPXCHG_DOUBLE, because while it was removed
>> we don't use cmpxchg16b on cpu freelist anymore, we still use it on
>> slab freelist, and the alternative is slab_lock() which can be
>> interrupted by a nmi. Clarify the comment to mention it specifically.
>>
>> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
>> ---
>
> What a nice cleanup!
>
> Looks good to me,
> Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Thanks!
> with a nit below.
>
>> mm/slab.h | 1 -
>> mm/slub.c | 144 +++++++++++++-------------------------------------------------
>> 2 files changed, 29 insertions(+), 116 deletions(-)
>>
>> diff --git a/mm/slab.h b/mm/slab.h
>> index 4efec41b6445..e9a0738133ed 100644
>> --- a/mm/slab.h
>> +++ b/mm/slab.h
>> @@ -5268,10 +5196,11 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
>> if (!(s->flags & __CMPXCHG_DOUBLE) && !kmem_cache_debug(s))
>> /*
>> * kmalloc_nolock() is not supported on architectures that
>> - * don't implement cmpxchg16b, but debug caches don't use
>> - * per-cpu slab and per-cpu partial slabs. They rely on
>> - * kmem_cache_node->list_lock, so kmalloc_nolock() can
>> - * attempt to allocate from debug caches by
>> + * don't implement cmpxchg16b and thus need slab_lock()
>> + * which could be preempted by a nmi.
>
> nit: I think now this limitation can be removed because the only slab
> lock used in the allocation path is get_partial_node() ->
> __slab_update_freelist(), but it is always used under n->list_lock.
>
> Being preempted by a NMI while holding the slab lock is fine because
> NMI context should fail to acquire n->list_lock and bail out.
Hmm but somebody might be freeing with __slab_free() without taking the
n->list_lock (slab is on partial list and expected to remain there after the
free), then there's a NMI and the allocation can take n->list_lock fine?
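
IOW, a rough sketch (using the functions mentioned above; the exact path
may differ):

  __slab_free()
    slab_lock(slab)                           /* no n->list_lock taken */
      <NMI> -> bpf -> kmalloc_nolock()
        get_partial_node()
          spin_trylock_irqsave(&n->list_lock) /* succeeds, nobody holds it */
          __slab_update_freelist()
            slab_lock(slab)                   /* spins on the lock we interrupted */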
> But no hurry on this, it's probably not important enough to delay
> this series :)
>
>> + * But debug caches don't use that and only rely on
>> + * kmem_cache_node->list_lock, so kmalloc_nolock() can attempt
>> + * to allocate from debug caches by
>> * spin_trylock_irqsave(&n->list_lock, ...)
>> */
>> return NULL;
>>
>
On Thu, Jan 22, 2026 at 09:16:04AM +0100, Vlastimil Babka wrote:
> On 1/22/26 02:53, Harry Yoo wrote:
> > On Fri, Jan 16, 2026 at 03:40:34PM +0100, Vlastimil Babka wrote:
> >> if (!(s->flags & __CMPXCHG_DOUBLE) && !kmem_cache_debug(s))
> >> /*
> >> * kmalloc_nolock() is not supported on architectures that
> >> - * don't implement cmpxchg16b, but debug caches don't use
> >> - * per-cpu slab and per-cpu partial slabs. They rely on
> >> - * kmem_cache_node->list_lock, so kmalloc_nolock() can
> >> - * attempt to allocate from debug caches by
> >> + * don't implement cmpxchg16b and thus need slab_lock()
> >> + * which could be preempted by a nmi.
> >
> > nit: I think now this limitation can be removed because the only slab
> > lock used in the allocation path is get_partial_node() ->
> > __slab_update_freelist(), but it is always used under n->list_lock.
> >
> > Being preempted by a NMI while holding the slab lock is fine because
> > NMI context should fail to acquire n->list_lock and bail out.
>
> Hmm but somebody might be freeing with __slab_free() without taking the
> n->list_lock (slab is on partial list and expected to remain there after the
> free), then there's a NMI and the allocation can take n->list_lock fine?

Oops, you're right. Never mind. Concurrency is tricky :)

--
Cheers,
Harry / Hyeonggon
On Fri, Jan 16, 2026 at 03:40:34PM +0100, Vlastimil Babka wrote:
> The kmalloc_nolock() implementation has several complications and
> restrictions due to SLUB's cpu slab locking, lockless fastpath and
> PREEMPT_RT differences. With cpu slab usage removed, we can simplify
> things:
>
> - relax the PREEMPT_RT context checks as they were before commit
> a4ae75d1b6a2 ("slab: fix kmalloc_nolock() context check for
> PREEMPT_RT") and also reference the explanation comment in the page
> allocator
>
> - the local_lock_cpu_slab() macros became unused, remove them
>
> - we no longer need to set up lockdep classes on PREEMPT_RT
>
> - we no longer need to annotate ___slab_alloc as NOKPROBE_SYMBOL
> since there's no lockless cpu freelist manipulation anymore
>
> - __slab_alloc_node() can be called from kmalloc_nolock_noprof()
> unconditionally. It can also no longer return EBUSY. But trylock
> failures can still happen so retry with the larger bucket if the
> allocation fails for any reason.
>
> Note that we still need __CMPXCHG_DOUBLE, because while it was removed
> we don't use cmpxchg16b on cpu freelist anymore, we still use it on
> slab freelist, and the alternative is slab_lock() which can be
> interrupted by a nmi. Clarify the comment to mention it specifically.
>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> ---
> mm/slab.h | 1 -
> mm/slub.c | 144 +++++++++++++-------------------------------------------------
> 2 files changed, 29 insertions(+), 116 deletions(-)
>
Looks good to me.
Reviewed-by: Hao Li <hao.li@linux.dev>
--
Thanks,
Hao
On Tue, Jan 20, 2026 at 12:07 PM Hao Li <hao.li@linux.dev> wrote:
>
> On Fri, Jan 16, 2026 at 03:40:34PM +0100, Vlastimil Babka wrote:
> > The kmalloc_nolock() implementation has several complications and
> > restrictions due to SLUB's cpu slab locking, lockless fastpath and
> > PREEMPT_RT differences. With cpu slab usage removed, we can simplify
> > things:
> >
> > - relax the PREEMPT_RT context checks as they were before commit
> > a4ae75d1b6a2 ("slab: fix kmalloc_nolock() context check for
> > PREEMPT_RT") and also reference the explanation comment in the page
> > allocator
> >
> > - the local_lock_cpu_slab() macros became unused, remove them
> >
> > - we no longer need to set up lockdep classes on PREEMPT_RT
> >
> > - we no longer need to annotate ___slab_alloc as NOKPROBE_SYMBOL
> > since there's no lockless cpu freelist manipulation anymore
> >
> > - __slab_alloc_node() can be called from kmalloc_nolock_noprof()
> > unconditionally. It can also no longer return EBUSY. But trylock
> > failures can still happen so retry with the larger bucket if the
> > allocation fails for any reason.
> >
> > Note that we still need __CMPXCHG_DOUBLE, because while it was removed
> > we don't use cmpxchg16b on cpu freelist anymore, we still use it on
> > slab freelist, and the alternative is slab_lock() which can be
> > interrupted by a nmi. Clarify the comment to mention it specifically.
> >
> > Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> > ---
> > mm/slab.h | 1 -
> > mm/slub.c | 144 +++++++++++++-------------------------------------------------
> > 2 files changed, 29 insertions(+), 116 deletions(-)
> >
>
> Looks good to me.
> Reviewed-by: Hao Li <hao.li@linux.dev>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
>
> --
> Thanks,
> Hao