[PATCH v7 04/21] slab: add sheaf support for batching kfree_rcu() operations

Posted by Vlastimil Babka 4 weeks, 1 day ago
Extend the sheaf infrastructure for more efficient kfree_rcu() handling.
For caches with sheaves, on each cpu maintain a rcu_free sheaf in
addition to main and spare sheaves.

kfree_rcu() operations will try to put objects on this sheaf. Once full,
the sheaf is detached and submitted to call_rcu() with a handler that
will try to put it in the barn, or flush to slab pages using bulk free,
when the barn is full. Then a new empty sheaf must be obtained to put
more objects there.

It's possible that no free sheaves are available to use for a new
rcu_free sheaf, and the allocation in kfree_rcu() context can only use
GFP_NOWAIT and thus may fail. In that case, fall back to the existing
kfree_rcu() implementation.
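
A minimal caller-side sketch for illustration (made-up struct and names;
assumes struct foo objects come from a cache with sheaves enabled):

  struct foo {
          int data;
          struct rcu_head rcu;
  };

  static void drop_foo(struct foo *p)
  {
          /* With sheaves, p is batched onto the per-cpu rcu_free sheaf;
           * otherwise (or on failure) the existing kfree_rcu() batching
           * is used as before. Nothing changes for the caller. */
          kfree_rcu(p, rcu);
  }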

Expected advantages:
- batching the kfree_rcu() operations, that could eventually replace the
  existing batching
- sheaves can be reused for allocations via barn instead of being
  flushed to slabs, which is more efficient
  - this includes cases where only some cpus are allowed to process rcu
    callbacks (Android)

Possible disadvantage:
- objects might be waiting for more than their grace period (it is
  determined by the last object freed into the sheaf), increasing memory
  usage - but the existing batching does that too.

Only implement this for CONFIG_KVFREE_RCU_BATCHED as the tiny
implementation favors smaller memory footprint over performance.

Add CONFIG_SLUB_STATS counters free_rcu_sheaf and free_rcu_sheaf_fail to
count how many kfree_rcu() used the rcu_free sheaf successfully and how
many had to fall back to the existing implementation.

Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 mm/slab.h        |   2 +
 mm/slab_common.c |  24 +++++++
 mm/slub.c        | 192 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 216 insertions(+), 2 deletions(-)

diff --git a/mm/slab.h b/mm/slab.h
index 206987ce44a4d053ebe3b5e50784d2dd23822cd1..f1866f2d9b211bb0d7f24644b80ef4b50a7c3d24 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -435,6 +435,8 @@ static inline bool is_kmalloc_normal(struct kmem_cache *s)
 	return !(s->flags & (SLAB_CACHE_DMA|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT));
 }
 
+bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj);
+
 #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
 			 SLAB_CACHE_DMA32 | SLAB_PANIC | \
 			 SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS | \
diff --git a/mm/slab_common.c b/mm/slab_common.c
index e2b197e47866c30acdbd1fee4159f262a751c5a7..2d806e02568532a1000fd3912db6978e945dcfa8 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1608,6 +1608,27 @@ static void kfree_rcu_work(struct work_struct *work)
 		kvfree_rcu_list(head);
 }
 
+static bool kfree_rcu_sheaf(void *obj)
+{
+	struct kmem_cache *s;
+	struct folio *folio;
+	struct slab *slab;
+
+	if (is_vmalloc_addr(obj))
+		return false;
+
+	folio = virt_to_folio(obj);
+	if (unlikely(!folio_test_slab(folio)))
+		return false;
+
+	slab = folio_slab(folio);
+	s = slab->slab_cache;
+	if (s->cpu_sheaves)
+		return __kfree_rcu_sheaf(s, obj);
+
+	return false;
+}
+
 static bool
 need_offload_krc(struct kfree_rcu_cpu *krcp)
 {
@@ -1952,6 +1973,9 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr)
 	if (!head)
 		might_sleep();
 
+	if (kfree_rcu_sheaf(ptr))
+		return;
+
 	// Queue the object but don't yet schedule the batch.
 	if (debug_rcu_head_queue(ptr)) {
 		// Probable double kfree_rcu(), just leak.
diff --git a/mm/slub.c b/mm/slub.c
index 42cb5848f1cecb17174967ff8b102b20a50110e3..6a64478befdebdb44cd7896d673bd20a7a6e2889 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -367,6 +367,8 @@ enum stat_item {
 	ALLOC_FASTPATH,		/* Allocation from cpu slab */
 	ALLOC_SLOWPATH,		/* Allocation by getting a new cpu slab */
 	FREE_PCS,		/* Free to percpu sheaf */
+	FREE_RCU_SHEAF,		/* Free to rcu_free sheaf */
+	FREE_RCU_SHEAF_FAIL,	/* Failed to free to a rcu_free sheaf */
 	FREE_FASTPATH,		/* Free to cpu slab */
 	FREE_SLOWPATH,		/* Freeing not to cpu slab */
 	FREE_FROZEN,		/* Freeing to frozen slab */
@@ -461,6 +463,7 @@ struct slab_sheaf {
 		struct rcu_head rcu_head;
 		struct list_head barn_list;
 	};
+	struct kmem_cache *cache;
 	unsigned int size;
 	void *objects[];
 };
@@ -469,6 +472,7 @@ struct slub_percpu_sheaves {
 	local_trylock_t lock;
 	struct slab_sheaf *main; /* never NULL when unlocked */
 	struct slab_sheaf *spare; /* empty or full, may be NULL */
+	struct slab_sheaf *rcu_free; /* for batching kfree_rcu() */
 };
 
 /*
@@ -2531,6 +2535,8 @@ static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp)
 	if (unlikely(!sheaf))
 		return NULL;
 
+	sheaf->cache = s;
+
 	stat(s, SHEAF_ALLOC);
 
 	return sheaf;
@@ -2655,6 +2661,43 @@ static void sheaf_flush_unused(struct kmem_cache *s, struct slab_sheaf *sheaf)
 	sheaf->size = 0;
 }
 
+static void __rcu_free_sheaf_prepare(struct kmem_cache *s,
+				     struct slab_sheaf *sheaf)
+{
+	bool init = slab_want_init_on_free(s);
+	void **p = &sheaf->objects[0];
+	unsigned int i = 0;
+
+	while (i < sheaf->size) {
+		struct slab *slab = virt_to_slab(p[i]);
+
+		memcg_slab_free_hook(s, slab, p + i, 1);
+		alloc_tagging_slab_free_hook(s, slab, p + i, 1);
+
+		if (unlikely(!slab_free_hook(s, p[i], init, true))) {
+			p[i] = p[--sheaf->size];
+			continue;
+		}
+
+		i++;
+	}
+}
+
+static void rcu_free_sheaf_nobarn(struct rcu_head *head)
+{
+	struct slab_sheaf *sheaf;
+	struct kmem_cache *s;
+
+	sheaf = container_of(head, struct slab_sheaf, rcu_head);
+	s = sheaf->cache;
+
+	__rcu_free_sheaf_prepare(s, sheaf);
+
+	sheaf_flush_unused(s, sheaf);
+
+	free_empty_sheaf(s, sheaf);
+}
+
 /*
  * Caller needs to make sure migration is disabled in order to fully flush
  * single cpu's sheaves
@@ -2667,7 +2710,7 @@ static void sheaf_flush_unused(struct kmem_cache *s, struct slab_sheaf *sheaf)
 static void pcs_flush_all(struct kmem_cache *s)
 {
 	struct slub_percpu_sheaves *pcs;
-	struct slab_sheaf *spare;
+	struct slab_sheaf *spare, *rcu_free;
 
 	local_lock(&s->cpu_sheaves->lock);
 	pcs = this_cpu_ptr(s->cpu_sheaves);
@@ -2675,6 +2718,9 @@ static void pcs_flush_all(struct kmem_cache *s)
 	spare = pcs->spare;
 	pcs->spare = NULL;
 
+	rcu_free = pcs->rcu_free;
+	pcs->rcu_free = NULL;
+
 	local_unlock(&s->cpu_sheaves->lock);
 
 	if (spare) {
@@ -2682,6 +2728,9 @@ static void pcs_flush_all(struct kmem_cache *s)
 		free_empty_sheaf(s, spare);
 	}
 
+	if (rcu_free)
+		call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn);
+
 	sheaf_flush_main(s);
 }
 
@@ -2698,6 +2747,11 @@ static void __pcs_flush_all_cpu(struct kmem_cache *s, unsigned int cpu)
 		free_empty_sheaf(s, pcs->spare);
 		pcs->spare = NULL;
 	}
+
+	if (pcs->rcu_free) {
+		call_rcu(&pcs->rcu_free->rcu_head, rcu_free_sheaf_nobarn);
+		pcs->rcu_free = NULL;
+	}
 }
 
 static void pcs_destroy(struct kmem_cache *s)
@@ -2723,6 +2777,7 @@ static void pcs_destroy(struct kmem_cache *s)
 		 */
 
 		WARN_ON(pcs->spare);
+		WARN_ON(pcs->rcu_free);
 
 		if (!WARN_ON(pcs->main->size)) {
 			free_empty_sheaf(s, pcs->main);
@@ -3780,7 +3835,7 @@ static bool has_pcs_used(int cpu, struct kmem_cache *s)
 
 	pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
 
-	return (pcs->spare || pcs->main->size);
+	return (pcs->spare || pcs->rcu_free || pcs->main->size);
 }
 
 static void pcs_flush_all(struct kmem_cache *s);
@@ -5415,6 +5470,130 @@ bool free_to_pcs(struct kmem_cache *s, void *object)
 	return true;
 }
 
+static void rcu_free_sheaf(struct rcu_head *head)
+{
+	struct slab_sheaf *sheaf;
+	struct node_barn *barn;
+	struct kmem_cache *s;
+
+	sheaf = container_of(head, struct slab_sheaf, rcu_head);
+
+	s = sheaf->cache;
+
+	/*
+	 * This may remove some objects due to slab_free_hook() returning false,
+	 * so that the sheaf might no longer be completely full. But it's easier
+	 * to handle it as full (unless it became completely empty), as the code
+	 * handles it fine. The only downside is that sheaf will serve fewer
+	 * allocations when reused. It only happens due to debugging, which is a
+	 * performance hit anyway.
+	 */
+	__rcu_free_sheaf_prepare(s, sheaf);
+
+	barn = get_node(s, numa_mem_id())->barn;
+
+	/* due to slab_free_hook() */
+	if (unlikely(sheaf->size == 0))
+		goto empty;
+
+	/*
+	 * Checking nr_full/nr_empty outside lock avoids contention in case the
+	 * barn is at the respective limit. Due to the race we might go over the
+	 * limit but that should be rare and harmless.
+	 */
+
+	if (data_race(barn->nr_full) < MAX_FULL_SHEAVES) {
+		stat(s, BARN_PUT);
+		barn_put_full_sheaf(barn, sheaf);
+		return;
+	}
+
+	stat(s, BARN_PUT_FAIL);
+	sheaf_flush_unused(s, sheaf);
+
+empty:
+	if (data_race(barn->nr_empty) < MAX_EMPTY_SHEAVES) {
+		barn_put_empty_sheaf(barn, sheaf);
+		return;
+	}
+
+	free_empty_sheaf(s, sheaf);
+}
+
+bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj)
+{
+	struct slub_percpu_sheaves *pcs;
+	struct slab_sheaf *rcu_sheaf;
+
+	if (!local_trylock(&s->cpu_sheaves->lock))
+		goto fail;
+
+	pcs = this_cpu_ptr(s->cpu_sheaves);
+
+	if (unlikely(!pcs->rcu_free)) {
+
+		struct slab_sheaf *empty;
+		struct node_barn *barn;
+
+		if (pcs->spare && pcs->spare->size == 0) {
+			pcs->rcu_free = pcs->spare;
+			pcs->spare = NULL;
+			goto do_free;
+		}
+
+		barn = get_barn(s);
+
+		empty = barn_get_empty_sheaf(barn);
+
+		if (empty) {
+			pcs->rcu_free = empty;
+			goto do_free;
+		}
+
+		local_unlock(&s->cpu_sheaves->lock);
+
+		empty = alloc_empty_sheaf(s, GFP_NOWAIT);
+
+		if (!empty)
+			goto fail;
+
+		if (!local_trylock(&s->cpu_sheaves->lock)) {
+			barn_put_empty_sheaf(barn, empty);
+			goto fail;
+		}
+
+		pcs = this_cpu_ptr(s->cpu_sheaves);
+
+		if (unlikely(pcs->rcu_free))
+			barn_put_empty_sheaf(barn, empty);
+		else
+			pcs->rcu_free = empty;
+	}
+
+do_free:
+
+	rcu_sheaf = pcs->rcu_free;
+
+	rcu_sheaf->objects[rcu_sheaf->size++] = obj;
+
+	if (likely(rcu_sheaf->size < s->sheaf_capacity))
+		rcu_sheaf = NULL;
+	else
+		pcs->rcu_free = NULL;
+
+	local_unlock(&s->cpu_sheaves->lock);
+
+	if (rcu_sheaf)
+		call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf);
+
+	stat(s, FREE_RCU_SHEAF);
+	return true;
+
+fail:
+	stat(s, FREE_RCU_SHEAF_FAIL);
+	return false;
+}
+
 /*
  * Bulk free objects to the percpu sheaves.
  * Unlike free_to_pcs() this includes the calls to all necessary hooks
@@ -6911,6 +7090,11 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
 	struct kmem_cache_node *n;
 
 	flush_all_cpus_locked(s);
+
+	/* we might have rcu sheaves in flight */
+	if (s->cpu_sheaves)
+		rcu_barrier();
+
 	/* Attempt to free all objects */
 	for_each_kmem_cache_node(s, node, n) {
 		if (n->barn)
@@ -8286,6 +8470,8 @@ STAT_ATTR(ALLOC_PCS, alloc_cpu_sheaf);
 STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
 STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
 STAT_ATTR(FREE_PCS, free_cpu_sheaf);
+STAT_ATTR(FREE_RCU_SHEAF, free_rcu_sheaf);
+STAT_ATTR(FREE_RCU_SHEAF_FAIL, free_rcu_sheaf_fail);
 STAT_ATTR(FREE_FASTPATH, free_fastpath);
 STAT_ATTR(FREE_SLOWPATH, free_slowpath);
 STAT_ATTR(FREE_FROZEN, free_frozen);
@@ -8384,6 +8570,8 @@ static struct attribute *slab_attrs[] = {
 	&alloc_fastpath_attr.attr,
 	&alloc_slowpath_attr.attr,
 	&free_cpu_sheaf_attr.attr,
+	&free_rcu_sheaf_attr.attr,
+	&free_rcu_sheaf_fail_attr.attr,
 	&free_fastpath_attr.attr,
 	&free_slowpath_attr.attr,
 	&free_frozen_attr.attr,

-- 
2.51.0
Re: [PATCH v7 04/21] slab: add sheaf support for batching kfree_rcu() operations
Posted by Uladzislau Rezki 3 weeks, 3 days ago
On Wed, Sep 03, 2025 at 02:59:46PM +0200, Vlastimil Babka wrote:
> Extend the sheaf infrastructure for more efficient kfree_rcu() handling.
> For caches with sheaves, on each cpu maintain a rcu_free sheaf in
> addition to main and spare sheaves.
> 
> kfree_rcu() operations will try to put objects on this sheaf. Once full,
> the sheaf is detached and submitted to call_rcu() with a handler that
> will try to put it in the barn, or flush to slab pages using bulk free,
> when the barn is full. Then a new empty sheaf must be obtained to put
> more objects there.
> 
> It's possible that no free sheaves are available to use for a new
> rcu_free sheaf, and the allocation in kfree_rcu() context can only use
> GFP_NOWAIT and thus may fail. In that case, fall back to the existing
> kfree_rcu() implementation.
> 
> Expected advantages:
> - batching the kfree_rcu() operations, that could eventually replace the
>   existing batching
> - sheaves can be reused for allocations via barn instead of being
>   flushed to slabs, which is more efficient
>   - this includes cases where only some cpus are allowed to process rcu
>     callbacks (Android)
> 
> Possible disadvantage:
> - objects might be waiting for more than their grace period (it is
>   determined by the last object freed into the sheaf), increasing memory
>   usage - but the existing batching does that too.
> 
> Only implement this for CONFIG_KVFREE_RCU_BATCHED as the tiny
> implementation favors smaller memory footprint over performance.
> 
> Add CONFIG_SLUB_STATS counters free_rcu_sheaf and free_rcu_sheaf_fail to
> count how many kfree_rcu() used the rcu_free sheaf successfully and how
> many had to fall back to the existing implementation.
> 
> Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
> Reviewed-by: Suren Baghdasaryan <surenb@google.com>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> ---
>  mm/slab.h        |   2 +
>  mm/slab_common.c |  24 +++++++
>  mm/slub.c        | 192 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  3 files changed, 216 insertions(+), 2 deletions(-)
> 
> diff --git a/mm/slab.h b/mm/slab.h
> index 206987ce44a4d053ebe3b5e50784d2dd23822cd1..f1866f2d9b211bb0d7f24644b80ef4b50a7c3d24 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -435,6 +435,8 @@ static inline bool is_kmalloc_normal(struct kmem_cache *s)
>  	return !(s->flags & (SLAB_CACHE_DMA|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT));
>  }
>  
> +bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj);
> +
>  #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
>  			 SLAB_CACHE_DMA32 | SLAB_PANIC | \
>  			 SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS | \
> diff --git a/mm/slab_common.c b/mm/slab_common.c
> index e2b197e47866c30acdbd1fee4159f262a751c5a7..2d806e02568532a1000fd3912db6978e945dcfa8 100644
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -1608,6 +1608,27 @@ static void kfree_rcu_work(struct work_struct *work)
>  		kvfree_rcu_list(head);
>  }
>  
> +static bool kfree_rcu_sheaf(void *obj)
> +{
> +	struct kmem_cache *s;
> +	struct folio *folio;
> +	struct slab *slab;
> +
> +	if (is_vmalloc_addr(obj))
> +		return false;
> +
> +	folio = virt_to_folio(obj);
> +	if (unlikely(!folio_test_slab(folio)))
> +		return false;
> +
> +	slab = folio_slab(folio);
> +	s = slab->slab_cache;
> +	if (s->cpu_sheaves)
> +		return __kfree_rcu_sheaf(s, obj);
> +
> +	return false;
> +}
> +
>  static bool
>  need_offload_krc(struct kfree_rcu_cpu *krcp)
>  {
> @@ -1952,6 +1973,9 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr)
>  	if (!head)
>  		might_sleep();
>  
> +	if (kfree_rcu_sheaf(ptr))
> +		return;
> +
Uh.. I have some concerns about this.

This patch introduces a new path that collides with the existing
kvfree_rcu() logic. It implements some batching that we already have.

- kvfree_rcu_barrier() does not know about the "sheaf" path. Am I missing
  something? How do you guarantee that kvfree_rcu_barrier() flushes
  sheaves? If it is part of kvfree_rcu(), it has to care about this.

- we do not allocate in the kvfree_rcu() path because of PREEMPT_RT, i.e.
  kvfree_rcu() is supposed to be callable from non-sleeping contexts.
- call_rcu() can be slow, therefore we do not use it in the kvfree_rcu().

IMO, it is worth reusing the existing logic in kvfree_rcu(). I can help
with it when I have more cycles as part of my RCU work.

--
Uladzislau Rezki
Re: [PATCH v7 04/21] slab: add sheaf support for batching kfree_rcu() operations
Posted by Vlastimil Babka 3 weeks, 3 days ago
On 9/8/25 13:59, Uladzislau Rezki wrote:
> On Wed, Sep 03, 2025 at 02:59:46PM +0200, Vlastimil Babka wrote:
>> Extend the sheaf infrastructure for more efficient kfree_rcu() handling.
>> For caches with sheaves, on each cpu maintain a rcu_free sheaf in
>> addition to main and spare sheaves.
>> 
>> kfree_rcu() operations will try to put objects on this sheaf. Once full,
>> the sheaf is detached and submitted to call_rcu() with a handler that
>> will try to put it in the barn, or flush to slab pages using bulk free,
>> when the barn is full. Then a new empty sheaf must be obtained to put
>> more objects there.
>> 
>> It's possible that no free sheaves are available to use for a new
>> rcu_free sheaf, and the allocation in kfree_rcu() context can only use
>> GFP_NOWAIT and thus may fail. In that case, fall back to the existing
>> kfree_rcu() implementation.
>> 
>> Expected advantages:
>> - batching the kfree_rcu() operations, that could eventually replace the
>>   existing batching
>> - sheaves can be reused for allocations via barn instead of being
>>   flushed to slabs, which is more efficient
>>   - this includes cases where only some cpus are allowed to process rcu
>>     callbacks (Android)
>> 
>> Possible disadvantage:
>> - objects might be waiting for more than their grace period (it is
>>   determined by the last object freed into the sheaf), increasing memory
>>   usage - but the existing batching does that too.
>> 
>> Only implement this for CONFIG_KVFREE_RCU_BATCHED as the tiny
>> implementation favors smaller memory footprint over performance.
>> 
>> Add CONFIG_SLUB_STATS counters free_rcu_sheaf and free_rcu_sheaf_fail to
>> count how many kfree_rcu() used the rcu_free sheaf successfully and how
>> many had to fall back to the existing implementation.
>> 
>> Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
>> Reviewed-by: Suren Baghdasaryan <surenb@google.com>
>> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
>> ---
>>  mm/slab.h        |   2 +
>>  mm/slab_common.c |  24 +++++++
>>  mm/slub.c        | 192 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>  3 files changed, 216 insertions(+), 2 deletions(-)
>> 
>> diff --git a/mm/slab.h b/mm/slab.h
>> index 206987ce44a4d053ebe3b5e50784d2dd23822cd1..f1866f2d9b211bb0d7f24644b80ef4b50a7c3d24 100644
>> --- a/mm/slab.h
>> +++ b/mm/slab.h
>> @@ -435,6 +435,8 @@ static inline bool is_kmalloc_normal(struct kmem_cache *s)
>>  	return !(s->flags & (SLAB_CACHE_DMA|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT));
>>  }
>>  
>> +bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj);
>> +
>>  #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
>>  			 SLAB_CACHE_DMA32 | SLAB_PANIC | \
>>  			 SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS | \
>> diff --git a/mm/slab_common.c b/mm/slab_common.c
>> index e2b197e47866c30acdbd1fee4159f262a751c5a7..2d806e02568532a1000fd3912db6978e945dcfa8 100644
>> --- a/mm/slab_common.c
>> +++ b/mm/slab_common.c
>> @@ -1608,6 +1608,27 @@ static void kfree_rcu_work(struct work_struct *work)
>>  		kvfree_rcu_list(head);
>>  }
>>  
>> +static bool kfree_rcu_sheaf(void *obj)
>> +{
>> +	struct kmem_cache *s;
>> +	struct folio *folio;
>> +	struct slab *slab;
>> +
>> +	if (is_vmalloc_addr(obj))
>> +		return false;
>> +
>> +	folio = virt_to_folio(obj);
>> +	if (unlikely(!folio_test_slab(folio)))
>> +		return false;
>> +
>> +	slab = folio_slab(folio);
>> +	s = slab->slab_cache;
>> +	if (s->cpu_sheaves)
>> +		return __kfree_rcu_sheaf(s, obj);
>> +
>> +	return false;
>> +}
>> +
>>  static bool
>>  need_offload_krc(struct kfree_rcu_cpu *krcp)
>>  {
>> @@ -1952,6 +1973,9 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr)
>>  	if (!head)
>>  		might_sleep();
>>  
>> +	if (kfree_rcu_sheaf(ptr))
>> +		return;
>> +
> Uh.. I have some concerns about this.
> 
> This patch introduces a new path which is a collision to the
> existing kvfree_rcu() logic. It implements some batching which
> we already have.

Yes but for caches with sheaves it's better to recycle the whole sheaf (as
described), which is so different from the existing batching scheme that I'm
not sure if there's a sensible way to combine them.

> - kvfree_rcu_barrier() does not know about "sheaf" path. Am i missing
>   something? How do you guarantee that kvfree_rcu_barrier() flushes
>   sheafs? If it is part of kvfree_rcu() it has to care about this.

Hm, good point, thanks. I've taken care of flushing the kfree_rcu() sheaves
in kmem_cache_destroy(), but forgot that kvfree_rcu_barrier() can also be
used outside of that - we have one user in codetag_unload_module() currently.
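
For illustration, the pattern that has to keep working is roughly the
following (hypothetical caller, modelled on the codetag case):

  kvfree_rcu(obj, rcu);   /* may now sit on a per-cpu rcu_free sheaf */
  ...
  kvfree_rcu_barrier();   /* must therefore also flush those sheaves,
                             otherwise obj may still be pending when the
                             caller tears down the backing storage */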

> - we do not allocate in kvfree_rcu() path because of PREEMMPT_RT, i.e.
>   kvfree_rcu() is supposed it can be called from the non-sleeping contexts.

Hm I could not find where that distinction is in the code, can you give a
hint please. In __kfree_rcu_sheaf() I do only have a GFP_NOWAIT attempt.

> - call_rcu() can be slow, therefore we do not use it in the kvfree_rcu().

If call_rcu() is called once per 32 kfree_rcu() filling up the rcu sheaf, is
it still too slow?
> IMO, it is worth to reuse existing logic in the kvfree_rcu(). I can help
> with it when i have more cycles as part of my RCU work.
It would be most welcome! I'd suggest we proceed with this for now, after I
fix up kvfree_rcu_barrier(), and we can attempt to consolidate later.

> --
> Uladzislau Rezki
Re: [PATCH v7 04/21] slab: add sheaf support for batching kfree_rcu() operations
Posted by Uladzislau Rezki 3 weeks, 2 days ago
On Mon, Sep 08, 2025 at 02:45:11PM +0200, Vlastimil Babka wrote:
> On 9/8/25 13:59, Uladzislau Rezki wrote:
> > On Wed, Sep 03, 2025 at 02:59:46PM +0200, Vlastimil Babka wrote:
> >> Extend the sheaf infrastructure for more efficient kfree_rcu() handling.
> >> For caches with sheaves, on each cpu maintain a rcu_free sheaf in
> >> addition to main and spare sheaves.
> >> 
> >> kfree_rcu() operations will try to put objects on this sheaf. Once full,
> >> the sheaf is detached and submitted to call_rcu() with a handler that
> >> will try to put it in the barn, or flush to slab pages using bulk free,
> >> when the barn is full. Then a new empty sheaf must be obtained to put
> >> more objects there.
> >> 
> >> It's possible that no free sheaves are available to use for a new
> >> rcu_free sheaf, and the allocation in kfree_rcu() context can only use
> >> GFP_NOWAIT and thus may fail. In that case, fall back to the existing
> >> kfree_rcu() implementation.
> >> 
> >> Expected advantages:
> >> - batching the kfree_rcu() operations, that could eventually replace the
> >>   existing batching
> >> - sheaves can be reused for allocations via barn instead of being
> >>   flushed to slabs, which is more efficient
> >>   - this includes cases where only some cpus are allowed to process rcu
> >>     callbacks (Android)
> >> 
> >> Possible disadvantage:
> >> - objects might be waiting for more than their grace period (it is
> >>   determined by the last object freed into the sheaf), increasing memory
> >>   usage - but the existing batching does that too.
> >> 
> >> Only implement this for CONFIG_KVFREE_RCU_BATCHED as the tiny
> >> implementation favors smaller memory footprint over performance.
> >> 
> >> Add CONFIG_SLUB_STATS counters free_rcu_sheaf and free_rcu_sheaf_fail to
> >> count how many kfree_rcu() used the rcu_free sheaf successfully and how
> >> many had to fall back to the existing implementation.
> >> 
> >> Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
> >> Reviewed-by: Suren Baghdasaryan <surenb@google.com>
> >> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> >> ---
> >>  mm/slab.h        |   2 +
> >>  mm/slab_common.c |  24 +++++++
> >>  mm/slub.c        | 192 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> >>  3 files changed, 216 insertions(+), 2 deletions(-)
> >> 
> >> diff --git a/mm/slab.h b/mm/slab.h
> >> index 206987ce44a4d053ebe3b5e50784d2dd23822cd1..f1866f2d9b211bb0d7f24644b80ef4b50a7c3d24 100644
> >> --- a/mm/slab.h
> >> +++ b/mm/slab.h
> >> @@ -435,6 +435,8 @@ static inline bool is_kmalloc_normal(struct kmem_cache *s)
> >>  	return !(s->flags & (SLAB_CACHE_DMA|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT));
> >>  }
> >>  
> >> +bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj);
> >> +
> >>  #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
> >>  			 SLAB_CACHE_DMA32 | SLAB_PANIC | \
> >>  			 SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS | \
> >> diff --git a/mm/slab_common.c b/mm/slab_common.c
> >> index e2b197e47866c30acdbd1fee4159f262a751c5a7..2d806e02568532a1000fd3912db6978e945dcfa8 100644
> >> --- a/mm/slab_common.c
> >> +++ b/mm/slab_common.c
> >> @@ -1608,6 +1608,27 @@ static void kfree_rcu_work(struct work_struct *work)
> >>  		kvfree_rcu_list(head);
> >>  }
> >>  
> >> +static bool kfree_rcu_sheaf(void *obj)
> >> +{
> >> +	struct kmem_cache *s;
> >> +	struct folio *folio;
> >> +	struct slab *slab;
> >> +
> >> +	if (is_vmalloc_addr(obj))
> >> +		return false;
> >> +
> >> +	folio = virt_to_folio(obj);
> >> +	if (unlikely(!folio_test_slab(folio)))
> >> +		return false;
> >> +
> >> +	slab = folio_slab(folio);
> >> +	s = slab->slab_cache;
> >> +	if (s->cpu_sheaves)
> >> +		return __kfree_rcu_sheaf(s, obj);
> >> +
> >> +	return false;
> >> +}
> >> +
> >>  static bool
> >>  need_offload_krc(struct kfree_rcu_cpu *krcp)
> >>  {
> >> @@ -1952,6 +1973,9 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr)
> >>  	if (!head)
> >>  		might_sleep();
> >>  
> >> +	if (kfree_rcu_sheaf(ptr))
> >> +		return;
> >> +
> > Uh.. I have some concerns about this.
> > 
> > This patch introduces a new path which is a collision to the
> > existing kvfree_rcu() logic. It implements some batching which
> > we already have.
> 
> Yes but for caches with sheaves it's better to recycle the whole sheaf (as
> described), which is so different from the existing batching scheme that I'm
> not sure if there's a sensible way to combine them.
> 
> > - kvfree_rcu_barrier() does not know about "sheaf" path. Am i missing
> >   something? How do you guarantee that kvfree_rcu_barrier() flushes
> >   sheafs? If it is part of kvfree_rcu() it has to care about this.
> 
> Hm good point, thanks. I've taken care of handling flushing related to
> kfree_rcu() sheaves in kmem_cache_destroy(), but forgot that
> kvfree_rcu_barrier() can be also used outside of that - we have one user in
> codetag_unload_module() currently.
> 
> > - we do not allocate in kvfree_rcu() path because of PREEMMPT_RT, i.e.
> >   kvfree_rcu() is supposed it can be called from the non-sleeping contexts.
> 
> Hm I could not find where that distinction is in the code, can you give a
> hint please. In __kfree_rcu_sheaf() I do only have a GFP_NOWAIT attempt.
> 
For PREEMPT_RT a regular spin-lock is an rt-mutex, which can sleep. We
made kvfree_rcu() so that it can be invoked from non-sleeping contexts:

CONFIG_PREEMPT_RT

preempt_disable() or something similar;
 kvfree_rcu();
  GFP_NOWAIT - lock rt-mutex

If the GFP_NOWAIT path does not take any spin-locks, or only takes
raw_spin_locks, then we are safe.

> > - call_rcu() can be slow, therefore we do not use it in the kvfree_rcu().
> 
> If call_rcu() is called once per 32 kfree_rcu() filling up the rcu sheaf, is
> it still too slow?
>
You do not know where in the queue this callback lands - at the beginning,
at the end, etc. It becomes part of a generic list which is processed one
by one and can contain thousands of callbacks.

If performance is not needed then it is not an issue. But in
kvfree_rcu() we do not use it, because we want to offload
fast.

--
Uladzislau Rezki
Re: [PATCH v7 04/21] slab: add sheaf support for batching kfree_rcu() operations
Posted by Liam R. Howlett 3 weeks, 2 days ago
* Uladzislau Rezki <urezki@gmail.com> [250909 05:08]:

...

> 
> > > - call_rcu() can be slow, therefore we do not use it in the kvfree_rcu().
> > 
> > If call_rcu() is called once per 32 kfree_rcu() filling up the rcu sheaf, is
> > it still too slow?
> >
> You do not know where in a queue this callback lands, in the beginning,
> in the end, etc. It is part of generic list which is processed one by
> one. It can contain thousands of callbacks.

How does this differ from kvfree_rcu()?

Surely if you have enough calls to kvfree_rcu(), you will end up with a
large list of frees before the end of a grace period?  Our placement in
the freeing order would still be dependent on what else is using the
infrastructure in the same grace period, right?

How is kvfree_rcu() affected by rcu callback offloading to a specific
cpu and rcu expedite?  Often these two features come into play for
certain workloads which are of concern to us.

> 
> If performance is not needed then it is not an issue. But in
> kvfree_rcu() we do not use it, because of we want to offload
> fast.

Today, I free things using call_rcu() and a custom callback so I would
think stacking 32 together would make the list shorter, but latency
would increase by waiting until there are 32.
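
(For concreteness, that pattern is roughly the following - names made up:)

  static void foo_free_rcu(struct rcu_head *head)
  {
          struct foo *p = container_of(head, struct foo, rcu);

          kmem_cache_free(foo_cachep, p);
  }

  call_rcu(&p->rcu, foo_free_rcu);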

If we wanted to flush the kvfree_rcu() list, is it done in the same way
as the call_rcu() list, or is there a better way?

Thanks,
Liam
Re: [PATCH v7 04/21] slab: add sheaf support for batching kfree_rcu() operations
Posted by Uladzislau Rezki 3 weeks, 2 days ago
On Tue, Sep 09, 2025 at 10:35:15AM -0400, Liam R. Howlett wrote:
> * Uladzislau Rezki <urezki@gmail.com> [250909 05:08]:
> 
> ...
> 
> > 
> > > > - call_rcu() can be slow, therefore we do not use it in the kvfree_rcu().
> > > 
> > > If call_rcu() is called once per 32 kfree_rcu() filling up the rcu sheaf, is
> > > it still too slow?
> > >
> > You do not know where in a queue this callback lands, in the beginning,
> > in the end, etc. It is part of generic list which is processed one by
> > one. It can contain thousands of callbacks.
> 
> How does this differ from kvfree_rcu()?
> 
> Surely if you have enough calls to kvfree_rcu(), you will end up with a
> large list of frees before the end of a grace period?  Our placement in
> the freeing order would still be dependent on what else is using the
> infrastructure in the same grace period, right?
> 
In kfree_rcu() we use page blocks to carry pointers. Lists are only used
under low-memory conditions, when a page can not be allocated or the cache
is empty. But this is not part of the call_rcu() track in any way.

Right, regular call_rcu() puts callbacks into its own internal lists and
they are processed one by one during list iteration. Such lists can
contain hundreds of thousands of callbacks.

>
> How is kvfree_rcu() affected by rcu callback offloading to a specific
> cpu and rcu expedite?  Often these two features come into play for
> certain workloads which are of concern to us.
> 
We maintain a separate path. Offload is done after a grace period is
over, which is the classic way. Historically all deferred freeing was one
call_rcu() per ptr.

> > 
> > If performance is not needed then it is not an issue. But in
> > kvfree_rcu() we do not use it, because of we want to offload
> > fast.
> 
> Today, I free things using call_rcu() and a custom callback so I would
> think stacking 32 together would make the list shorter, but latency
> would increase by waiting until there are 32.
> 
> If we wanted to flush the kvfree_rcu() list, is it done in the same way
> as the call_rcu() list, or is there a better way?
> 
For this case we have kvfree_rcu_barrier(). It is not the same as
call_rcu() flushing.

--
Uladzislau Rezki
Re: [PATCH v7 04/21] slab: add sheaf support for batching kfree_rcu() operations
Posted by Vlastimil Babka 3 weeks, 2 days ago
On 9/9/25 11:08, Uladzislau Rezki wrote:
> On Mon, Sep 08, 2025 at 02:45:11PM +0200, Vlastimil Babka wrote:
>> 
>> Hm I could not find where that distinction is in the code, can you give a
>> hint please. In __kfree_rcu_sheaf() I do only have a GFP_NOWAIT attempt.
>> 
> For PREEMPT_RT a regular spin-lock is an rt-mutex which can sleep. We
> made kvfree_rcu() to make it possible to invoke it from non-sleep contexts:

Oh you mean it's not allocating even on !RT so there's no RT-specific code.

> CONFIG_PREEMPT_RT
> 
> preempt_disable() or something similar;
>  kvfree_rcu();
>   GFP_NOWAIT - lock rt-mutex
> 
> If GFP_NOWAIT semantic does not access any spin-locks then we are safe
> or if it uses raw_spin_locks.

It does access spinlocks, so it's not safe. Thanks, I didn't realize that
aspect of kfree_rcu(). We'll need to solve this before enabling sheaves
everywhere. I don't think the vma or maple tree code would kfree_rcu() a
vma or maple_node in such a restricted context. But to be safe I'll just
disable the kfree_rcu() sheaves for PREEMPT_RT for now.
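
Concretely, the chain I need to avoid on PREEMPT_RT is roughly (sketch):

  preempt_disable();      /* or another non-sleeping context */
  kvfree_rcu(p, rcu);
    /* -> __kfree_rcu_sheaf()
     *      -> barn spinlock and/or alloc_empty_sheaf(s, GFP_NOWAIT),
     *         both of which take sleeping locks on PREEMPT_RT */
  preempt_enable();
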
>> > - call_rcu() can be slow, therefore we do not use it in the kvfree_rcu().
>> 
>> If call_rcu() is called once per 32 kfree_rcu() filling up the rcu sheaf, is
>> it still too slow?
>>
> You do not know where in a queue this callback lands, in the beginning,
> in the end, etc. It is part of generic list which is processed one by
> one. It can contain thousands of callbacks.
> 
> If performance is not needed then it is not an issue. But in
> kvfree_rcu() we do not use it, because of we want to offload
> fast.

So IIUC one could call_rcu(obj, some_callback_accessing_obj) and then do a
kfree_rcu(obj) and the latter can be processed first?
> --
> Uladzislau Rezki
Re: [PATCH v7 04/21] slab: add sheaf support for batching kfree_rcu() operations
Posted by Vlastimil Babka 3 weeks, 2 days ago
On 9/9/25 12:20, Vlastimil Babka wrote:
> On 9/9/25 11:08, Uladzislau Rezki wrote:
>> On Mon, Sep 08, 2025 at 02:45:11PM +0200, Vlastimil Babka wrote:
>>> 
>>> Hm I could not find where that distinction is in the code, can you give a
>>> hint please. In __kfree_rcu_sheaf() I do only have a GFP_NOWAIT attempt.
>>> 
>> For PREEMPT_RT a regular spin-lock is an rt-mutex which can sleep. We
>> made kvfree_rcu() to make it possible to invoke it from non-sleep contexts:
> 
> Oh you mean it's not allocating even on !RT so there's no RT-specific code.
> 
>> CONFIG_PREEMPT_RT
>> 
>> preempt_disable() or something similar;
>>  kvfree_rcu();
>>   GFP_NOWAIT - lock rt-mutex
>> 
>> If GFP_NOWAIT semantic does not access any spin-locks then we are safe
>> or if it uses raw_spin_locks.
> 
> It does access spinlocks so it's not safe. Thanks, I didn't realize that
> aspect of kfree_rcu(). We'll need to solve this before making sheaves
> enabled everywhere. I don't think the vma or maple tree code would
> kfree_rcu() vma or maple_node in such a restricted context. But to be safe
> I'll just disable the kfree rcu sheaves for PREEMPT_RT for now.

So I came up with this fixup to avoid PREEMPT_RT troubles and make
kvfree_rcu_barrier() work.

----8<----
From 15a8db2ef716b5db547f2d86ab30d8774333fb04 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Tue, 9 Sep 2025 16:18:52 +0200
Subject: [PATCH] slub: fix issues with kfree_rcu sheaf handling

Fix two issues reported by Ulad:

- on PREEMPT_RT if kfree_rcu() comes from an atomic context, taking a
  spinlock on the barn or doing a GFP_NOWAIT allocation of a new sheaf
  might not be possible. For now just limit the usage of
  kfree_rcu_sheaf() to !PREEMPT_RT

- kvfree_rcu_barrier() must flush all rcu_free sheaves to deliver on its
  promise. Its usage isn't limited to destroying a single cache. Add
  flush_all_rcu_sheaves() to do that.

Reported-by: Uladzislau Rezki <urezki@gmail.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 mm/slab.h        |  1 +
 mm/slab_common.c |  4 ++-
 mm/slub.c        | 74 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 78 insertions(+), 1 deletion(-)

diff --git a/mm/slab.h b/mm/slab.h
index f1866f2d9b21..e82e51c44bd0 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -436,6 +436,7 @@ static inline bool is_kmalloc_normal(struct kmem_cache *s)
 }
 
 bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj);
+void flush_all_rcu_sheaves(void);
 
 #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
 			 SLAB_CACHE_DMA32 | SLAB_PANIC | \
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 2d806e025685..005a4319c06a 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1973,7 +1973,7 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr)
 	if (!head)
 		might_sleep();
 
-	if (kfree_rcu_sheaf(ptr))
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT) && kfree_rcu_sheaf(ptr))
 		return;
 
 	// Queue the object but don't yet schedule the batch.
@@ -2050,6 +2050,8 @@ void kvfree_rcu_barrier(void)
 	bool queued;
 	int i, cpu;
 
+	flush_all_rcu_sheaves();
+
 	/*
 	 * Firstly we detach objects and queue them over an RCU-batch
 	 * for all CPUs. Finally queued works are flushed for each CPU.
diff --git a/mm/slub.c b/mm/slub.c
index 9f9b7e1fa356..19cd8444ae5d 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3895,6 +3895,80 @@ static void flush_all(struct kmem_cache *s)
 	cpus_read_unlock();
 }
 
+static void flush_rcu_sheaf(struct work_struct *w)
+{
+	struct slub_percpu_sheaves *pcs;
+	struct slab_sheaf *rcu_free;
+	struct slub_flush_work *sfw;
+	struct kmem_cache *s;
+
+	sfw = container_of(w, struct slub_flush_work, work);
+	s = sfw->s;
+
+	local_lock(&s->cpu_sheaves->lock);
+	pcs = this_cpu_ptr(s->cpu_sheaves);
+
+	rcu_free = pcs->rcu_free;
+	pcs->rcu_free = NULL;
+
+	local_unlock(&s->cpu_sheaves->lock);
+
+	if (rcu_free)
+		call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn);
+}
+
+
+/* needed for kvfree_rcu_barrier() */
+void flush_all_rcu_sheaves(void)
+{
+	struct slub_percpu_sheaves *pcs;
+	struct slub_flush_work *sfw;
+	struct kmem_cache *s;
+	bool flushed = false;
+	unsigned int cpu;
+
+	cpus_read_lock();
+	mutex_lock(&slab_mutex);
+
+	list_for_each_entry(s, &slab_caches, list) {
+		if (!s->cpu_sheaves)
+			continue;
+
+		mutex_lock(&flush_lock);
+
+		for_each_online_cpu(cpu) {
+			sfw = &per_cpu(slub_flush, cpu);
+			pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
+
+			if (!pcs->rcu_free || !pcs->rcu_free->size) {
+				sfw->skip = true;
+				continue;
+			}
+
+			INIT_WORK(&sfw->work, flush_rcu_sheaf);
+			sfw->skip = false;
+			sfw->s = s;
+			queue_work_on(cpu, flushwq, &sfw->work);
+			flushed = true;
+		}
+
+		for_each_online_cpu(cpu) {
+			sfw = &per_cpu(slub_flush, cpu);
+			if (sfw->skip)
+				continue;
+			flush_work(&sfw->work);
+		}
+
+		mutex_unlock(&flush_lock);
+	}
+
+	mutex_unlock(&slab_mutex);
+	cpus_read_unlock();
+
+	if (flushed)
+		rcu_barrier();
+}
+
 /*
  * Use the cpu notifier to insure that the cpu slabs are flushed when
  * necessary.
-- 
2.51.0
Re: [PATCH v7 04/21] slab: add sheaf support for batching kfree_rcu() operations
Posted by Uladzislau Rezki 3 weeks, 2 days ago
On Tue, Sep 09, 2025 at 11:08:20AM +0200, Uladzislau Rezki wrote:
> On Mon, Sep 08, 2025 at 02:45:11PM +0200, Vlastimil Babka wrote:
> > On 9/8/25 13:59, Uladzislau Rezki wrote:
> > > On Wed, Sep 03, 2025 at 02:59:46PM +0200, Vlastimil Babka wrote:
> > >> Extend the sheaf infrastructure for more efficient kfree_rcu() handling.
> > >> For caches with sheaves, on each cpu maintain a rcu_free sheaf in
> > >> addition to main and spare sheaves.
> > >> 
> > >> kfree_rcu() operations will try to put objects on this sheaf. Once full,
> > >> the sheaf is detached and submitted to call_rcu() with a handler that
> > >> will try to put it in the barn, or flush to slab pages using bulk free,
> > >> when the barn is full. Then a new empty sheaf must be obtained to put
> > >> more objects there.
> > >> 
> > >> It's possible that no free sheaves are available to use for a new
> > >> rcu_free sheaf, and the allocation in kfree_rcu() context can only use
> > >> GFP_NOWAIT and thus may fail. In that case, fall back to the existing
> > >> kfree_rcu() implementation.
> > >> 
> > >> Expected advantages:
> > >> - batching the kfree_rcu() operations, that could eventually replace the
> > >>   existing batching
> > >> - sheaves can be reused for allocations via barn instead of being
> > >>   flushed to slabs, which is more efficient
> > >>   - this includes cases where only some cpus are allowed to process rcu
> > >>     callbacks (Android)
> > >> 
> > >> Possible disadvantage:
> > >> - objects might be waiting for more than their grace period (it is
> > >>   determined by the last object freed into the sheaf), increasing memory
> > >>   usage - but the existing batching does that too.
> > >> 
> > >> Only implement this for CONFIG_KVFREE_RCU_BATCHED as the tiny
> > >> implementation favors smaller memory footprint over performance.
> > >> 
> > >> Add CONFIG_SLUB_STATS counters free_rcu_sheaf and free_rcu_sheaf_fail to
> > >> count how many kfree_rcu() used the rcu_free sheaf successfully and how
> > >> many had to fall back to the existing implementation.
> > >> 
> > >> Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
> > >> Reviewed-by: Suren Baghdasaryan <surenb@google.com>
> > >> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> > >> ---
> > >>  mm/slab.h        |   2 +
> > >>  mm/slab_common.c |  24 +++++++
> > >>  mm/slub.c        | 192 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> > >>  3 files changed, 216 insertions(+), 2 deletions(-)
> > >> 
> > >> diff --git a/mm/slab.h b/mm/slab.h
> > >> index 206987ce44a4d053ebe3b5e50784d2dd23822cd1..f1866f2d9b211bb0d7f24644b80ef4b50a7c3d24 100644
> > >> --- a/mm/slab.h
> > >> +++ b/mm/slab.h
> > >> @@ -435,6 +435,8 @@ static inline bool is_kmalloc_normal(struct kmem_cache *s)
> > >>  	return !(s->flags & (SLAB_CACHE_DMA|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT));
> > >>  }
> > >>  
> > >> +bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj);
> > >> +
> > >>  #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
> > >>  			 SLAB_CACHE_DMA32 | SLAB_PANIC | \
> > >>  			 SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS | \
> > >> diff --git a/mm/slab_common.c b/mm/slab_common.c
> > >> index e2b197e47866c30acdbd1fee4159f262a751c5a7..2d806e02568532a1000fd3912db6978e945dcfa8 100644
> > >> --- a/mm/slab_common.c
> > >> +++ b/mm/slab_common.c
> > >> @@ -1608,6 +1608,27 @@ static void kfree_rcu_work(struct work_struct *work)
> > >>  		kvfree_rcu_list(head);
> > >>  }
> > >>  
> > >> +static bool kfree_rcu_sheaf(void *obj)
> > >> +{
> > >> +	struct kmem_cache *s;
> > >> +	struct folio *folio;
> > >> +	struct slab *slab;
> > >> +
> > >> +	if (is_vmalloc_addr(obj))
> > >> +		return false;
> > >> +
> > >> +	folio = virt_to_folio(obj);
> > >> +	if (unlikely(!folio_test_slab(folio)))
> > >> +		return false;
> > >> +
> > >> +	slab = folio_slab(folio);
> > >> +	s = slab->slab_cache;
> > >> +	if (s->cpu_sheaves)
> > >> +		return __kfree_rcu_sheaf(s, obj);
> > >> +
> > >> +	return false;
> > >> +}
> > >> +
> > >>  static bool
> > >>  need_offload_krc(struct kfree_rcu_cpu *krcp)
> > >>  {
> > >> @@ -1952,6 +1973,9 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr)
> > >>  	if (!head)
> > >>  		might_sleep();
> > >>  
> > >> +	if (kfree_rcu_sheaf(ptr))
> > >> +		return;
> > >> +
> > > Uh.. I have some concerns about this.
> > > 
> > > This patch introduces a new path which is a collision to the
> > > existing kvfree_rcu() logic. It implements some batching which
> > > we already have.
> > 
> > Yes but for caches with sheaves it's better to recycle the whole sheaf (as
> > described), which is so different from the existing batching scheme that I'm
> > not sure if there's a sensible way to combine them.
> > 
> > > - kvfree_rcu_barrier() does not know about "sheaf" path. Am i missing
> > >   something? How do you guarantee that kvfree_rcu_barrier() flushes
> > >   sheafs? If it is part of kvfree_rcu() it has to care about this.
> > 
> > Hm good point, thanks. I've taken care of handling flushing related to
> > kfree_rcu() sheaves in kmem_cache_destroy(), but forgot that
> > kvfree_rcu_barrier() can be also used outside of that - we have one user in
> > codetag_unload_module() currently.
> > 
> > > - we do not allocate in kvfree_rcu() path because of PREEMMPT_RT, i.e.
> > >   kvfree_rcu() is supposed it can be called from the non-sleeping contexts.
> > 
> > Hm I could not find where that distinction is in the code, can you give a
> > hint please. In __kfree_rcu_sheaf() I do only have a GFP_NOWAIT attempt.
> > 
> For PREEMPT_RT a regular spin-lock is an rt-mutex which can sleep. We
> made kvfree_rcu() to make it possible to invoke it from non-sleep contexts:
> 
> CONFIG_PREEMPT_RT
> 
> preempt_disable() or something similar;
>  kvfree_rcu();
>   GFP_NOWAIT - lock rt-mutex
> 
> If GFP_NOWAIT semantic does not access any spin-locks then we are safe
> or if it uses raw_spin_locks.
> 
And this is valid only for the double-argument form; the single-argument
form can be invoked from sleeping contexts only, so there you can allocate.
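
I.e. roughly (illustrative):

  /* double-argument form: callable from non-sleeping contexts,
   * so it must not allocate */
  kvfree_rcu(p, rcu);

  /* single-argument form: may sleep, so allocating there is fine */
  kvfree_rcu_mightsleep(p);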

--
Uladzislau Rezki