Before we enable percpu sheaves for kmalloc caches, we need to make sure
kmalloc_nolock() and kfree_nolock() will continue working properly and
not spin when not allowed to.
Percpu sheaves themselves use local_trylock() so they are already
compatible. We just need to be careful with the barn->lock spin_lock.
Pass a new allow_spin parameter where necessary to use
spin_trylock_irqsave().
In kmalloc_nolock_noprof() we can now attempt alloc_from_pcs() safely,
for now it will always fail until we enable sheaves for kmalloc caches
next. Similarly in kfree_nolock() we can attempt free_to_pcs().
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
mm/slub.c | 74 ++++++++++++++++++++++++++++++++++++++++++++-------------------
1 file changed, 52 insertions(+), 22 deletions(-)
diff --git a/mm/slub.c b/mm/slub.c
index ecb10ed5acfe..5d0b2cf66520 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2876,7 +2876,8 @@ static void pcs_destroy(struct kmem_cache *s)
s->cpu_sheaves = NULL;
}
-static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn)
+static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn,
+ bool allow_spin)
{
struct slab_sheaf *empty = NULL;
unsigned long flags;
@@ -2884,7 +2885,10 @@ static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn)
if (!data_race(barn->nr_empty))
return NULL;
- spin_lock_irqsave(&barn->lock, flags);
+ if (likely(allow_spin))
+ spin_lock_irqsave(&barn->lock, flags);
+ else if (!spin_trylock_irqsave(&barn->lock, flags))
+ return NULL;
if (likely(barn->nr_empty)) {
empty = list_first_entry(&barn->sheaves_empty,
@@ -2961,7 +2965,8 @@ static struct slab_sheaf *barn_get_full_or_empty_sheaf(struct node_barn *barn)
* change.
*/
static struct slab_sheaf *
-barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty)
+barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty,
+ bool allow_spin)
{
struct slab_sheaf *full = NULL;
unsigned long flags;
@@ -2969,7 +2974,10 @@ barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty)
if (!data_race(barn->nr_full))
return NULL;
- spin_lock_irqsave(&barn->lock, flags);
+ if (likely(allow_spin))
+ spin_lock_irqsave(&barn->lock, flags);
+ else if (!spin_trylock_irqsave(&barn->lock, flags))
+ return NULL;
if (likely(barn->nr_full)) {
full = list_first_entry(&barn->sheaves_full, struct slab_sheaf,
@@ -2990,7 +2998,8 @@ barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty)
* barn. But if there are too many full sheaves, reject this with -E2BIG.
*/
static struct slab_sheaf *
-barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full)
+barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full,
+ bool allow_spin)
{
struct slab_sheaf *empty;
unsigned long flags;
@@ -3001,7 +3010,10 @@ barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full)
if (!data_race(barn->nr_empty))
return ERR_PTR(-ENOMEM);
- spin_lock_irqsave(&barn->lock, flags);
+ if (likely(allow_spin))
+ spin_lock_irqsave(&barn->lock, flags);
+ else if (!spin_trylock_irqsave(&barn->lock, flags))
+ return NULL;
if (likely(barn->nr_empty)) {
empty = list_first_entry(&barn->sheaves_empty, struct slab_sheaf,
@@ -5000,7 +5012,8 @@ __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs,
return NULL;
}
- full = barn_replace_empty_sheaf(barn, pcs->main);
+ full = barn_replace_empty_sheaf(barn, pcs->main,
+ gfpflags_allow_spinning(gfp));
if (full) {
stat(s, BARN_GET);
@@ -5017,7 +5030,7 @@ __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs,
empty = pcs->spare;
pcs->spare = NULL;
} else {
- empty = barn_get_empty_sheaf(barn);
+ empty = barn_get_empty_sheaf(barn, true);
}
}
@@ -5154,7 +5167,8 @@ void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp, int node)
}
static __fastpath_inline
-unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
+unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, gfp_t gfp, size_t size,
+ void **p)
{
struct slub_percpu_sheaves *pcs;
struct slab_sheaf *main;
@@ -5188,7 +5202,8 @@ unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
return allocated;
}
- full = barn_replace_empty_sheaf(barn, pcs->main);
+ full = barn_replace_empty_sheaf(barn, pcs->main,
+ gfpflags_allow_spinning(gfp));
if (full) {
stat(s, BARN_GET);
@@ -5693,7 +5708,7 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
gfp_t alloc_gfp = __GFP_NOWARN | __GFP_NOMEMALLOC | gfp_flags;
struct kmem_cache *s;
bool can_retry = true;
- void *ret = ERR_PTR(-EBUSY);
+ void *ret;
VM_WARN_ON_ONCE(gfp_flags & ~(__GFP_ACCOUNT | __GFP_ZERO |
__GFP_NO_OBJ_EXT));
@@ -5720,6 +5735,13 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
*/
return NULL;
+ ret = alloc_from_pcs(s, alloc_gfp, node);
+
+ if (ret)
+ goto success;
+
+ ret = ERR_PTR(-EBUSY);
+
/*
* Do not call slab_alloc_node(), since trylock mode isn't
* compatible with slab_pre_alloc_hook/should_failslab and
@@ -5756,6 +5778,7 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
ret = NULL;
}
+success:
maybe_wipe_obj_freeptr(s, ret);
slab_post_alloc_hook(s, NULL, alloc_gfp, 1, &ret,
slab_want_init_on_alloc(alloc_gfp, s), size);
@@ -6047,7 +6070,8 @@ static void __pcs_install_empty_sheaf(struct kmem_cache *s,
* unlocked.
*/
static struct slub_percpu_sheaves *
-__pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs)
+__pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs,
+ bool allow_spin)
{
struct slab_sheaf *empty;
struct node_barn *barn;
@@ -6071,7 +6095,7 @@ __pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs)
put_fail = false;
if (!pcs->spare) {
- empty = barn_get_empty_sheaf(barn);
+ empty = barn_get_empty_sheaf(barn, allow_spin);
if (empty) {
pcs->spare = pcs->main;
pcs->main = empty;
@@ -6085,7 +6109,7 @@ __pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs)
return pcs;
}
- empty = barn_replace_full_sheaf(barn, pcs->main);
+ empty = barn_replace_full_sheaf(barn, pcs->main, allow_spin);
if (!IS_ERR(empty)) {
stat(s, BARN_PUT);
@@ -6093,6 +6117,11 @@ __pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs)
return pcs;
}
+ if (!allow_spin) {
+ local_unlock(&s->cpu_sheaves->lock);
+ return NULL;
+ }
+
if (PTR_ERR(empty) == -E2BIG) {
/* Since we got here, spare exists and is full */
struct slab_sheaf *to_flush = pcs->spare;
@@ -6160,7 +6189,7 @@ __pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs)
* The object is expected to have passed slab_free_hook() already.
*/
static __fastpath_inline
-bool free_to_pcs(struct kmem_cache *s, void *object)
+bool free_to_pcs(struct kmem_cache *s, void *object, bool allow_spin)
{
struct slub_percpu_sheaves *pcs;
@@ -6171,7 +6200,7 @@ bool free_to_pcs(struct kmem_cache *s, void *object)
if (unlikely(pcs->main->size == s->sheaf_capacity)) {
- pcs = __pcs_replace_full_main(s, pcs);
+ pcs = __pcs_replace_full_main(s, pcs, allow_spin);
if (unlikely(!pcs))
return false;
}
@@ -6278,7 +6307,7 @@ bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj)
goto fail;
}
- empty = barn_get_empty_sheaf(barn);
+ empty = barn_get_empty_sheaf(barn, true);
if (empty) {
pcs->rcu_free = empty;
@@ -6398,7 +6427,7 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
goto no_empty;
if (!pcs->spare) {
- empty = barn_get_empty_sheaf(barn);
+ empty = barn_get_empty_sheaf(barn, true);
if (!empty)
goto no_empty;
@@ -6412,7 +6441,7 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
goto do_free;
}
- empty = barn_replace_full_sheaf(barn, pcs->main);
+ empty = barn_replace_full_sheaf(barn, pcs->main, true);
if (IS_ERR(empty)) {
stat(s, BARN_PUT_FAIL);
goto no_empty;
@@ -6659,7 +6688,7 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object,
if (likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id())
&& likely(!slab_test_pfmemalloc(slab))) {
- if (likely(free_to_pcs(s, object)))
+ if (likely(free_to_pcs(s, object, true)))
return;
}
@@ -6922,7 +6951,8 @@ void kfree_nolock(const void *object)
* since kasan quarantine takes locks and not supported from NMI.
*/
kasan_slab_free(s, x, false, false, /* skip quarantine */true);
- do_slab_free(s, slab, x, x, 0, _RET_IP_);
+ if (!free_to_pcs(s, x, false))
+ do_slab_free(s, slab, x, x, 0, _RET_IP_);
}
EXPORT_SYMBOL_GPL(kfree_nolock);
@@ -7465,7 +7495,7 @@ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size,
size--;
}
- i = alloc_from_pcs_bulk(s, size, p);
+ i = alloc_from_pcs_bulk(s, flags, size, p);
if (i < size) {
/*
--
2.51.1
On Thu, Oct 23, 2025 at 6:53 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> Before we enable percpu sheaves for kmalloc caches, we need to make sure
> kmalloc_nolock() and kfree_nolock() will continue working properly and
> not spin when not allowed to.
>
> Percpu sheaves themselves use local_trylock() so they are already
> compatible. We just need to be careful with the barn->lock spin_lock.
> Pass a new allow_spin parameter where necessary to use
> spin_trylock_irqsave().
>
> In kmalloc_nolock_noprof() we can now attempt alloc_from_pcs() safely,
> for now it will always fail until we enable sheaves for kmalloc caches
> next. Similarly in kfree_nolock() we can attempt free_to_pcs().
>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> ---
> mm/slub.c | 74 ++++++++++++++++++++++++++++++++++++++++++++-------------------
> 1 file changed, 52 insertions(+), 22 deletions(-)
>
> diff --git a/mm/slub.c b/mm/slub.c
> index ecb10ed5acfe..5d0b2cf66520 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -2876,7 +2876,8 @@ static void pcs_destroy(struct kmem_cache *s)
> s->cpu_sheaves = NULL;
> }
>
> -static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn)
> +static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn,
> + bool allow_spin)
> {
> struct slab_sheaf *empty = NULL;
> unsigned long flags;
> @@ -2884,7 +2885,10 @@ static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn)
> if (!data_race(barn->nr_empty))
> return NULL;
>
> - spin_lock_irqsave(&barn->lock, flags);
> + if (likely(allow_spin))
> + spin_lock_irqsave(&barn->lock, flags);
> + else if (!spin_trylock_irqsave(&barn->lock, flags))
> + return NULL;
>
> if (likely(barn->nr_empty)) {
> empty = list_first_entry(&barn->sheaves_empty,
> @@ -2961,7 +2965,8 @@ static struct slab_sheaf *barn_get_full_or_empty_sheaf(struct node_barn *barn)
> * change.
> */
> static struct slab_sheaf *
> -barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty)
> +barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty,
> + bool allow_spin)
> {
> struct slab_sheaf *full = NULL;
> unsigned long flags;
> @@ -2969,7 +2974,10 @@ barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty)
> if (!data_race(barn->nr_full))
> return NULL;
>
> - spin_lock_irqsave(&barn->lock, flags);
> + if (likely(allow_spin))
> + spin_lock_irqsave(&barn->lock, flags);
> + else if (!spin_trylock_irqsave(&barn->lock, flags))
> + return NULL;
>
> if (likely(barn->nr_full)) {
> full = list_first_entry(&barn->sheaves_full, struct slab_sheaf,
> @@ -2990,7 +2998,8 @@ barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty)
> * barn. But if there are too many full sheaves, reject this with -E2BIG.
> */
> static struct slab_sheaf *
> -barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full)
> +barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full,
> + bool allow_spin)
> {
> struct slab_sheaf *empty;
> unsigned long flags;
> @@ -3001,7 +3010,10 @@ barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full)
> if (!data_race(barn->nr_empty))
> return ERR_PTR(-ENOMEM);
>
> - spin_lock_irqsave(&barn->lock, flags);
> + if (likely(allow_spin))
> + spin_lock_irqsave(&barn->lock, flags);
> + else if (!spin_trylock_irqsave(&barn->lock, flags))
> + return NULL;
AI did a good job here. I spent an hour staring at the patch
for other reasons. Noticed this bug too and then went
"ohh, wait, AI mentioned it already". Time to retire.
> if (likely(barn->nr_empty)) {
> empty = list_first_entry(&barn->sheaves_empty, struct slab_sheaf,
> @@ -5000,7 +5012,8 @@ __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs,
> return NULL;
> }
>
> - full = barn_replace_empty_sheaf(barn, pcs->main);
> + full = barn_replace_empty_sheaf(barn, pcs->main,
> + gfpflags_allow_spinning(gfp));
>
> if (full) {
> stat(s, BARN_GET);
> @@ -5017,7 +5030,7 @@ __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs,
> empty = pcs->spare;
> pcs->spare = NULL;
> } else {
> - empty = barn_get_empty_sheaf(barn);
> + empty = barn_get_empty_sheaf(barn, true);
> }
> }
>
> @@ -5154,7 +5167,8 @@ void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp, int node)
> }
>
> static __fastpath_inline
> -unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
> +unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, gfp_t gfp, size_t size,
> + void **p)
> {
> struct slub_percpu_sheaves *pcs;
> struct slab_sheaf *main;
> @@ -5188,7 +5202,8 @@ unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
> return allocated;
> }
>
> - full = barn_replace_empty_sheaf(barn, pcs->main);
> + full = barn_replace_empty_sheaf(barn, pcs->main,
> + gfpflags_allow_spinning(gfp));
>
> if (full) {
> stat(s, BARN_GET);
> @@ -5693,7 +5708,7 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
> gfp_t alloc_gfp = __GFP_NOWARN | __GFP_NOMEMALLOC | gfp_flags;
> struct kmem_cache *s;
> bool can_retry = true;
> - void *ret = ERR_PTR(-EBUSY);
> + void *ret;
>
> VM_WARN_ON_ONCE(gfp_flags & ~(__GFP_ACCOUNT | __GFP_ZERO |
> __GFP_NO_OBJ_EXT));
> @@ -5720,6 +5735,13 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
> */
> return NULL;
>
> + ret = alloc_from_pcs(s, alloc_gfp, node);
> +
I would remove the empty line here.
> + if (ret)
> + goto success;
> +
> + ret = ERR_PTR(-EBUSY);
> +
> /*
> * Do not call slab_alloc_node(), since trylock mode isn't
> * compatible with slab_pre_alloc_hook/should_failslab and
> @@ -5756,6 +5778,7 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
> ret = NULL;
> }
>
> +success:
> maybe_wipe_obj_freeptr(s, ret);
> slab_post_alloc_hook(s, NULL, alloc_gfp, 1, &ret,
> slab_want_init_on_alloc(alloc_gfp, s), size);
> @@ -6047,7 +6070,8 @@ static void __pcs_install_empty_sheaf(struct kmem_cache *s,
> * unlocked.
> */
> static struct slub_percpu_sheaves *
> -__pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs)
> +__pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs,
> + bool allow_spin)
> {
> struct slab_sheaf *empty;
> struct node_barn *barn;
> @@ -6071,7 +6095,7 @@ __pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs)
> put_fail = false;
>
> if (!pcs->spare) {
> - empty = barn_get_empty_sheaf(barn);
> + empty = barn_get_empty_sheaf(barn, allow_spin);
> if (empty) {
> pcs->spare = pcs->main;
> pcs->main = empty;
> @@ -6085,7 +6109,7 @@ __pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs)
> return pcs;
> }
>
> - empty = barn_replace_full_sheaf(barn, pcs->main);
> + empty = barn_replace_full_sheaf(barn, pcs->main, allow_spin);
>
> if (!IS_ERR(empty)) {
> stat(s, BARN_PUT);
> @@ -6093,6 +6117,11 @@ __pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs)
> return pcs;
> }
>
> + if (!allow_spin) {
> + local_unlock(&s->cpu_sheaves->lock);
> + return NULL;
> + }
and would add a comment here to elaborate that the next
steps like sheaf_flush_unused() and alloc_empty_sheaf()
cannot handle !allow_spin.
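e.g. something along the lines of (exact wording up to you):

	if (!allow_spin) {
		/*
		 * The paths below may call sheaf_flush_unused() or
		 * alloc_empty_sheaf(), which cannot handle !allow_spin,
		 * so bail out and let the caller fall back.
		 */
		local_unlock(&s->cpu_sheaves->lock);
		return NULL;
	}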
> +
> if (PTR_ERR(empty) == -E2BIG) {
> /* Since we got here, spare exists and is full */
> struct slab_sheaf *to_flush = pcs->spare;
> @@ -6160,7 +6189,7 @@ __pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs)
> * The object is expected to have passed slab_free_hook() already.
> */
> static __fastpath_inline
> -bool free_to_pcs(struct kmem_cache *s, void *object)
> +bool free_to_pcs(struct kmem_cache *s, void *object, bool allow_spin)
> {
> struct slub_percpu_sheaves *pcs;
>
> @@ -6171,7 +6200,7 @@ bool free_to_pcs(struct kmem_cache *s, void *object)
>
> if (unlikely(pcs->main->size == s->sheaf_capacity)) {
>
> - pcs = __pcs_replace_full_main(s, pcs);
> + pcs = __pcs_replace_full_main(s, pcs, allow_spin);
> if (unlikely(!pcs))
> return false;
> }
> @@ -6278,7 +6307,7 @@ bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj)
> goto fail;
> }
>
> - empty = barn_get_empty_sheaf(barn);
> + empty = barn_get_empty_sheaf(barn, true);
>
> if (empty) {
> pcs->rcu_free = empty;
> @@ -6398,7 +6427,7 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
> goto no_empty;
>
> if (!pcs->spare) {
> - empty = barn_get_empty_sheaf(barn);
> + empty = barn_get_empty_sheaf(barn, true);
I'm allergic to booleans in arguments. They make callsites
hard to read. Especially if there are multiple bools.
We have horrendous lines in the verifier that we still need
to clean up due to bools:
check_load_mem(env, insn, true, false, false, "atomic_load");
barn_get_empty_sheaf(barn, true); looks benign,
but I would still use enum { DONT_SPIN, ALLOW_SPIN }
and use that in all functions instead of 'bool allow_spin'.
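Roughly (untested, names are just an illustration):

enum spin_mode {
	DONT_SPIN,
	ALLOW_SPIN,
};

static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn,
					       enum spin_mode mode)
{
	...
	if (likely(mode == ALLOW_SPIN))
		spin_lock_irqsave(&barn->lock, flags);
	else if (!spin_trylock_irqsave(&barn->lock, flags))
		return NULL;
	...
}

and then the callsites read as barn_get_empty_sheaf(barn, ALLOW_SPIN) or
barn_get_empty_sheaf(barn, DONT_SPIN) instead of a bare true/false.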
Aside from that I got worried that the sheaves fast path
may not be optimized well by the compiler:
if (unlikely(pcs->main->size == 0)) ...
object = pcs->main->objects[pcs->main->size - 1];
// object is accessed here
pcs->main->size--;
since object may alias into pcs->main and the compiler
may be tempted to reload 'main'.
Looks like it's fine, since the object itself is not actually read or written.
gcc15 asm looks good:
movq 8(%rbx), %rdx # _68->main, _69
movl 24(%rdx), %eax # _69->size, _70
# ../mm/slub.c:5129: if (unlikely(pcs->main->size == 0)) {
testl %eax, %eax # _70
je .L2076 #,
.L1953:
# ../mm/slub.c:5135: object = pcs->main->objects[pcs->main->size - 1];
leal -1(%rax), %esi #,
# ../mm/slub.c:5135: object = pcs->main->objects[pcs->main->size - 1];
movq 32(%rdx,%rsi,8), %rdi # prephitmp_309->objects[_81], object
# ../mm/slub.c:5135: object = pcs->main->objects[pcs->main->size - 1];
movq %rsi, %rax #,
# ../mm/slub.c:5137: if (unlikely(node_requested)) {
testb %r15b, %r15b # node_requested
jne .L2077 #,
.L1954:
# ../mm/slub.c:5149: pcs->main->size--;
movl %eax, 24(%rdx) # _81, prephitmp_30->size
On 10/24/25 21:43, Alexei Starovoitov wrote:
> On Thu, Oct 23, 2025 at 6:53 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>>
>> Before we enable percpu sheaves for kmalloc caches, we need to make sure
>> kmalloc_nolock() and kfree_nolock() will continue working properly and
>> not spin when not allowed to.
>>
>> Percpu sheaves themselves use local_trylock() so they are already
>> compatible. We just need to be careful with the barn->lock spin_lock.
>> Pass a new allow_spin parameter where necessary to use
>> spin_trylock_irqsave().
>>
>> In kmalloc_nolock_noprof() we can now attempt alloc_from_pcs() safely,
>> for now it will always fail until we enable sheaves for kmalloc caches
>> next. Similarly in kfree_nolock() we can attempt free_to_pcs().
>>
>> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
...>> @@ -5720,6 +5735,13 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
>> */
>> return NULL;
>>
>> + ret = alloc_from_pcs(s, alloc_gfp, node);
>> +
>
> I would remove the empty line here.
Ack.
>> @@ -6093,6 +6117,11 @@ __pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs)
>> return pcs;
>> }
>>
>> + if (!allow_spin) {
>> + local_unlock(&s->cpu_sheaves->lock);
>> + return NULL;
>> + }
>
> and would add a comment here to elaborate that the next
> steps like sheaf_flush_unused() and alloc_empty_sheaf()
> cannot handle !allow_spin.
Will do.
>> +
>> if (PTR_ERR(empty) == -E2BIG) {
>> /* Since we got here, spare exists and is full */
>> struct slab_sheaf *to_flush = pcs->spare;
>> @@ -6160,7 +6189,7 @@ __pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs)
>> * The object is expected to have passed slab_free_hook() already.
>> */
>> static __fastpath_inline
>> -bool free_to_pcs(struct kmem_cache *s, void *object)
>> +bool free_to_pcs(struct kmem_cache *s, void *object, bool allow_spin)
>> {
>> struct slub_percpu_sheaves *pcs;
>>
>> @@ -6171,7 +6200,7 @@ bool free_to_pcs(struct kmem_cache *s, void *object)
>>
>> if (unlikely(pcs->main->size == s->sheaf_capacity)) {
>>
>> - pcs = __pcs_replace_full_main(s, pcs);
>> + pcs = __pcs_replace_full_main(s, pcs, allow_spin);
>> if (unlikely(!pcs))
>> return false;
>> }
>> @@ -6278,7 +6307,7 @@ bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj)
>> goto fail;
>> }
>>
>> - empty = barn_get_empty_sheaf(barn);
>> + empty = barn_get_empty_sheaf(barn, true);
>>
>> if (empty) {
>> pcs->rcu_free = empty;
>> @@ -6398,7 +6427,7 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
>> goto no_empty;
>>
>> if (!pcs->spare) {
>> - empty = barn_get_empty_sheaf(barn);
>> + empty = barn_get_empty_sheaf(barn, true);
>
> I'm allergic to booleans in arguments. They make callsites
> hard to read. Especially if there are multiple bools.
> We have horrendous lines in the verifier that we still need
> to clean up due to bools:
> check_load_mem(env, insn, true, false, false, "atomic_load");
>
> barn_get_empty_sheaf(barn, true); looks benign,
> but I would still use enum { DONT_SPIN, ALLOW_SPIN }
> and use that in all functions instead of 'bool allow_spin'.
I'll put it on the TODO list. But I think it's just following the pattern of
what you did in all the work leading to kmalloc_nolock() :)
And it's a single bool for an internal function with limited exposure, so it
might be overkill. Will see.
> Aside from that I got worried that the sheaves fast path
> may not be optimized well by the compiler:
> if (unlikely(pcs->main->size == 0)) ...
> object = pcs->main->objects[pcs->main->size - 1];
> // object is accessed here
only by virt_to_folio() which takes a const void *x and is probably inlined
anyway...
> pcs->main->size--;
>
> since object may alias into pcs->main and the compiler
> may be tempted to reload 'main'.
Interesting, I wouldn't have thought about that possibility.
> Looks like it's fine, since the object itself is not actually read or written.
Wonder if it figures that out or just assumes it would be undefined behavior
(or would we need strict aliasing to allow the assumption?). But
good to know it looks ok, thanks!
> gcc15 asm looks good:
> movq 8(%rbx), %rdx # _68->main, _69
> movl 24(%rdx), %eax # _69->size, _70
> # ../mm/slub.c:5129: if (unlikely(pcs->main->size == 0)) {
> testl %eax, %eax # _70
> je .L2076 #,
> .L1953:
> # ../mm/slub.c:5135: object = pcs->main->objects[pcs->main->size - 1];
> leal -1(%rax), %esi #,
> # ../mm/slub.c:5135: object = pcs->main->objects[pcs->main->size - 1];
> movq 32(%rdx,%rsi,8), %rdi # prephitmp_309->objects[_81], object
> # ../mm/slub.c:5135: object = pcs->main->objects[pcs->main->size - 1];
> movq %rsi, %rax #,
> # ../mm/slub.c:5137: if (unlikely(node_requested)) {
> testb %r15b, %r15b # node_requested
> jne .L2077 #,
> .L1954:
> # ../mm/slub.c:5149: pcs->main->size--;
> movl %eax, 24(%rdx) # _81, prephitmp_30->size
On Thu, 23 Oct 2025 15:52:29 +0200 Vlastimil Babka <vbabka@suse.cz> wrote:
> Before we enable percpu sheaves for kmalloc caches, we need to make sure
> kmalloc_nolock() and kfree_nolock() will continue working properly and
> not spin when not allowed to.
>
> Percpu sheaves themselves use local_trylock() so they are already
> compatible. We just need to be careful with the barn->lock spin_lock.
> Pass a new allow_spin parameter where necessary to use
> spin_trylock_irqsave().
>
> In kmalloc_nolock_noprof() we can now attempt alloc_from_pcs() safely,
> for now it will always fail until we enable sheaves for kmalloc caches
> next. Similarly in kfree_nolock() we can attempt free_to_pcs().
>
Hi Vlastimil,
We're trying out the AI patch review automation on the BPF commits and it had
some questions about a few of these. Since the goal is to actually test the
automation, I'm putting them in unedited, but I did try and make sure they
were valid before sending.
This one is a little verbose for s/NULL/ERR_PTR/ suggestions, but:
> diff --git a/mm/slub.c b/mm/slub.c
> index ecb10ed5acfe..5d0b2cf66520 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
[ ... ]
> @@ -2990,7 +2998,8 @@ barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty)
> * barn. But if there are too many full sheaves, reject this with -E2BIG.
> */
> static struct slab_sheaf *
> -barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full)
> +barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full,
> + bool allow_spin)
> {
> struct slab_sheaf *empty;
> unsigned long flags;
> @@ -3001,7 +3010,10 @@ barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full)
> if (!data_race(barn->nr_empty))
> return ERR_PTR(-ENOMEM);
>
> - spin_lock_irqsave(&barn->lock, flags);
> + if (likely(allow_spin))
> + spin_lock_irqsave(&barn->lock, flags);
> + else if (!spin_trylock_irqsave(&barn->lock, flags))
> + return NULL;
barn_replace_full_sheaf() now returns NULL when the trylock fails, in
addition to its existing ERR_PTR(-ENOMEM) and ERR_PTR(-E2BIG) returns.
[ ... ]
> @@ -6047,7 +6070,8 @@ static void __pcs_install_empty_sheaf(struct kmem_cache *s,
> * unlocked.
> */
> static struct slub_percpu_sheaves *
> -__pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs)
> +__pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs,
> + bool allow_spin)
> {
> struct slab_sheaf *empty;
> struct node_barn *barn;
> @@ -6071,7 +6095,7 @@ __pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs)
> put_fail = false;
>
> if (!pcs->spare) {
> - empty = barn_get_empty_sheaf(barn);
> + empty = barn_get_empty_sheaf(barn, allow_spin);
> if (empty) {
> pcs->spare = pcs->main;
> pcs->main = empty;
> @@ -6085,7 +6109,7 @@ __pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs)
> return pcs;
> }
>
> - empty = barn_replace_full_sheaf(barn, pcs->main);
> + empty = barn_replace_full_sheaf(barn, pcs->main, allow_spin);
>
> if (!IS_ERR(empty)) {
> stat(s, BARN_PUT);
> pcs->main = empty;
> return pcs;
> }
Can this set pcs->main to NULL? When allow_spin=false and the trylock
fails, barn_replace_full_sheaf() returns NULL. Since !IS_ERR(NULL) is
true, the code executes pcs->main = NULL and returns pcs. This violates
the documented invariant that main is "never NULL when unlocked" (from
struct slub_percpu_sheaves comment at mm/slub.c:481).
The caller free_to_pcs() then dereferences pcs->main->objects at
mm/slub.c:6208, which would be a NULL pointer dereference:
pcs->main->objects[pcs->main->size++] = object;
The execution path is:
1. kfree_nolock() calls free_to_pcs() with allow_spin=false
2. free_to_pcs() calls __pcs_replace_full_main() with allow_spin=false
3. pcs->spare exists and is full (size == s->sheaf_capacity)
4. barn_replace_full_sheaf() trylock fails, returns NULL
5. !IS_ERR(NULL) = true, so pcs->main = NULL is executed
6. __pcs_replace_full_main() returns pcs with NULL main
7. free_to_pcs() dereferences pcs->main->objects causing NULL deref
Should the check be if (empty && !IS_ERR(empty)) to properly handle the
new NULL return case?
> @@ -6093,6 +6117,11 @@ __pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs)
> return pcs;
> }
>
> + if (!allow_spin) {
> + local_unlock(&s->cpu_sheaves->lock);
> + return NULL;
> + }
This check is never reached when barn_replace_full_sheaf() returns NULL,
because the early return at the !IS_ERR(empty) check above executes first.
[ ... ]
On 10/24/25 16:04, Chris Mason wrote:
> On Thu, 23 Oct 2025 15:52:29 +0200 Vlastimil Babka <vbabka@suse.cz> wrote:
>
>> Before we enable percpu sheaves for kmalloc caches, we need to make sure
>> kmalloc_nolock() and kfree_nolock() will continue working properly and
>> not spin when not allowed to.
>>
>> Percpu sheaves themselves use local_trylock() so they are already
>> compatible. We just need to be careful with the barn->lock spin_lock.
>> Pass a new allow_spin parameter where necessary to use
>> spin_trylock_irqsave().
>>
>> In kmalloc_nolock_noprof() we can now attempt alloc_from_pcs() safely,
>> for now it will always fail until we enable sheaves for kmalloc caches
>> next. Similarly in kfree_nolock() we can attempt free_to_pcs().
>>
>
> Hi Vlastimil,
>
> We're trying out the AI patch review automation on the BPF commits and it had
> some questions about a few of these. Since the goal is to actually test the
> automation, I'm putting them in unedited, but I did try and make sure they
> were valid before sending.
>
> This one is a little verbose for s/NULL/ERR_PTR/ suggestions, but:
>
>> diff --git a/mm/slub.c b/mm/slub.c
>> index ecb10ed5acfe..5d0b2cf66520 100644
>> --- a/mm/slub.c
>> +++ b/mm/slub.c
>
> [ ... ]
>
>> @@ -2990,7 +2998,8 @@ barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty)
>> * barn. But if there are too many full sheaves, reject this with -E2BIG.
>> */
>> static struct slab_sheaf *
>> -barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full)
>> +barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full,
>> + bool allow_spin)
>> {
>> struct slab_sheaf *empty;
>> unsigned long flags;
>> @@ -3001,7 +3010,10 @@ barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full)
>> if (!data_race(barn->nr_empty))
>> return ERR_PTR(-ENOMEM);
>>
>> - spin_lock_irqsave(&barn->lock, flags);
>> + if (likely(allow_spin))
>> + spin_lock_irqsave(&barn->lock, flags);
>> + else if (!spin_trylock_irqsave(&barn->lock, flags))
>> + return NULL;
>
> barn_replace_full_sheaf() now returns NULL when the trylock fails, in
> addition to its existing ERR_PTR(-ENOMEM) and ERR_PTR(-E2BIG) returns.
Good catch, cool it can find such bugs.
I'll return ERR_PTR(-EBUSY), which should be compatible with the callers.
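Something like this on top of this patch (untested), in barn_replace_full_sheaf():

	else if (!spin_trylock_irqsave(&barn->lock, flags))
-		return NULL;
+		return ERR_PTR(-EBUSY);

so the !IS_ERR() check in __pcs_replace_full_main() keeps doing the right
thing and the !allow_spin bail-out below it is actually reached.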