[PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations

Posted by Vlastimil Babka 5 months ago
Extend the sheaf infrastructure for more efficient kfree_rcu() handling.
For caches with sheaves, on each cpu maintain a rcu_free sheaf in
addition to main and spare sheaves.

kfree_rcu() operations will try to put objects on this sheaf. Once full,
the sheaf is detached and submitted to call_rcu() with a handler that
will try to put it in the barn, or, when the barn is full, flush it to
slab pages using bulk free. Then a new empty sheaf must be obtained to
put more objects there.

It's possible that no free sheaves are available to use for a new
rcu_free sheaf, and the allocation in kfree_rcu() context can only use
GFP_NOWAIT and thus may fail. In that case, fall back to the existing
kfree_rcu() implementation.
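
To illustrate the intended flow, here is a minimal userspace C model of
the per-cpu rcu_free sheaf logic (toy code only - invented names, no
locking and no real RCU; the actual kernel implementation is in the diff
below):

/* toy_sheaf.c - illustration only, not kernel code */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define SHEAF_CAPACITY 32

struct toy_sheaf {
	unsigned int size;
	void *objects[SHEAF_CAPACITY];
};

/* models the per-cpu pcs->rcu_free slot */
static struct toy_sheaf *rcu_free;

/*
 * Models the call_rcu() handler: after a grace period the full sheaf
 * would be put in the barn or bulk-freed; here we just free everything.
 */
static void toy_flush_sheaf(struct toy_sheaf *sheaf)
{
	for (unsigned int i = 0; i < sheaf->size; i++)
		free(sheaf->objects[i]);
	free(sheaf);
}

/*
 * Models __kfree_rcu_sheaf(); returning false means "fall back to the
 * existing kfree_rcu() batching".
 */
static bool toy_kfree_rcu_sheaf(void *obj)
{
	if (!rcu_free) {
		/* GFP_NOWAIT analogue: the allocation may fail */
		rcu_free = calloc(1, sizeof(*rcu_free));
		if (!rcu_free)
			return false;
	}

	rcu_free->objects[rcu_free->size++] = obj;

	if (rcu_free->size == SHEAF_CAPACITY) {
		/* detach the full sheaf and hand it over for freeing */
		toy_flush_sheaf(rcu_free);
		rcu_free = NULL;
	}
	return true;
}

int main(void)
{
	for (int i = 0; i < 100; i++)
		if (!toy_kfree_rcu_sheaf(malloc(8)))
			printf("fallback for object %d\n", i);

	/* models the flush performed by kvfree_rcu_barrier() */
	if (rcu_free)
		toy_flush_sheaf(rcu_free);
	return 0;
}

In the kernel the detached sheaf is submitted to call_rcu() rather than
freed immediately, and the fallback path queues the object into the
existing kfree_rcu() batching.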

Expected advantages:
- batching the kfree_rcu() operations, which could eventually replace
  the existing batching
- sheaves can be reused for allocations via barn instead of being
  flushed to slabs, which is more efficient
  - this includes cases where only some cpus are allowed to process rcu
    callbacks (Android)

Possible disadvantage:
- objects might be waiting for more than their grace period (it is
  determined by the last object freed into the sheaf), increasing memory
  usage - but the existing batching does that too.

Only implement this for CONFIG_KVFREE_RCU_BATCHED as the tiny
implementation favors smaller memory footprint over performance.

Also for now skip the usage of the rcu_free sheaf for CONFIG_PREEMPT_RT,
as the contexts where kfree_rcu() is called might not be compatible with
taking a barn spinlock or with a GFP_NOWAIT allocation of a new sheaf
that takes a spinlock - the current kfree_rcu() implementation avoids
doing that.

Teach kvfree_rcu_barrier() to flush all rcu_free sheaves from all caches
that have them. This is not a cheap operation, but the barrier usage is
rare - currently kmem_cache_destroy() or on module unload.

Add CONFIG_SLUB_STATS counters free_rcu_sheaf and free_rcu_sheaf_fail to
count how many kfree_rcu() calls used the rcu_free sheaf successfully and
how many had to fall back to the existing implementation.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 mm/slab.h        |   3 +
 mm/slab_common.c |  26 ++++++
 mm/slub.c        | 266 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 293 insertions(+), 2 deletions(-)

diff --git a/mm/slab.h b/mm/slab.h
index 206987ce44a4d053ebe3b5e50784d2dd23822cd1..e82e51c44bd00042d433ac8b46c2b4bbbdded9b1 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -435,6 +435,9 @@ static inline bool is_kmalloc_normal(struct kmem_cache *s)
 	return !(s->flags & (SLAB_CACHE_DMA|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT));
 }
 
+bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj);
+void flush_all_rcu_sheaves(void);
+
 #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
 			 SLAB_CACHE_DMA32 | SLAB_PANIC | \
 			 SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS | \
diff --git a/mm/slab_common.c b/mm/slab_common.c
index e2b197e47866c30acdbd1fee4159f262a751c5a7..005a4319c06a01d2b616a75396fcc43766a62ddb 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1608,6 +1608,27 @@ static void kfree_rcu_work(struct work_struct *work)
 		kvfree_rcu_list(head);
 }
 
+static bool kfree_rcu_sheaf(void *obj)
+{
+	struct kmem_cache *s;
+	struct folio *folio;
+	struct slab *slab;
+
+	if (is_vmalloc_addr(obj))
+		return false;
+
+	folio = virt_to_folio(obj);
+	if (unlikely(!folio_test_slab(folio)))
+		return false;
+
+	slab = folio_slab(folio);
+	s = slab->slab_cache;
+	if (s->cpu_sheaves)
+		return __kfree_rcu_sheaf(s, obj);
+
+	return false;
+}
+
 static bool
 need_offload_krc(struct kfree_rcu_cpu *krcp)
 {
@@ -1952,6 +1973,9 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr)
 	if (!head)
 		might_sleep();
 
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT) && kfree_rcu_sheaf(ptr))
+		return;
+
 	// Queue the object but don't yet schedule the batch.
 	if (debug_rcu_head_queue(ptr)) {
 		// Probable double kfree_rcu(), just leak.
@@ -2026,6 +2050,8 @@ void kvfree_rcu_barrier(void)
 	bool queued;
 	int i, cpu;
 
+	flush_all_rcu_sheaves();
+
 	/*
 	 * Firstly we detach objects and queue them over an RCU-batch
 	 * for all CPUs. Finally queued works are flushed for each CPU.
diff --git a/mm/slub.c b/mm/slub.c
index cba188b7e04ddf86debf9bc27a2f725db1b2056e..19cd8444ae5d210c77ae767912ca1ff3fc69c2a8 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -367,6 +367,8 @@ enum stat_item {
 	ALLOC_FASTPATH,		/* Allocation from cpu slab */
 	ALLOC_SLOWPATH,		/* Allocation by getting a new cpu slab */
 	FREE_PCS,		/* Free to percpu sheaf */
+	FREE_RCU_SHEAF,		/* Free to rcu_free sheaf */
+	FREE_RCU_SHEAF_FAIL,	/* Failed to free to a rcu_free sheaf */
 	FREE_FASTPATH,		/* Free to cpu slab */
 	FREE_SLOWPATH,		/* Freeing not to cpu slab */
 	FREE_FROZEN,		/* Freeing to frozen slab */
@@ -461,6 +463,7 @@ struct slab_sheaf {
 		struct rcu_head rcu_head;
 		struct list_head barn_list;
 	};
+	struct kmem_cache *cache;
 	unsigned int size;
 	void *objects[];
 };
@@ -469,6 +472,7 @@ struct slub_percpu_sheaves {
 	local_trylock_t lock;
 	struct slab_sheaf *main; /* never NULL when unlocked */
 	struct slab_sheaf *spare; /* empty or full, may be NULL */
+	struct slab_sheaf *rcu_free; /* for batching kfree_rcu() */
 };
 
 /*
@@ -2531,6 +2535,8 @@ static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp)
 	if (unlikely(!sheaf))
 		return NULL;
 
+	sheaf->cache = s;
+
 	stat(s, SHEAF_ALLOC);
 
 	return sheaf;
@@ -2655,6 +2661,43 @@ static void sheaf_flush_unused(struct kmem_cache *s, struct slab_sheaf *sheaf)
 	sheaf->size = 0;
 }
 
+static void __rcu_free_sheaf_prepare(struct kmem_cache *s,
+				     struct slab_sheaf *sheaf)
+{
+	bool init = slab_want_init_on_free(s);
+	void **p = &sheaf->objects[0];
+	unsigned int i = 0;
+
+	while (i < sheaf->size) {
+		struct slab *slab = virt_to_slab(p[i]);
+
+		memcg_slab_free_hook(s, slab, p + i, 1);
+		alloc_tagging_slab_free_hook(s, slab, p + i, 1);
+
+		if (unlikely(!slab_free_hook(s, p[i], init, true))) {
+			p[i] = p[--sheaf->size];
+			continue;
+		}
+
+		i++;
+	}
+}
+
+static void rcu_free_sheaf_nobarn(struct rcu_head *head)
+{
+	struct slab_sheaf *sheaf;
+	struct kmem_cache *s;
+
+	sheaf = container_of(head, struct slab_sheaf, rcu_head);
+	s = sheaf->cache;
+
+	__rcu_free_sheaf_prepare(s, sheaf);
+
+	sheaf_flush_unused(s, sheaf);
+
+	free_empty_sheaf(s, sheaf);
+}
+
 /*
  * Caller needs to make sure migration is disabled in order to fully flush
  * single cpu's sheaves
@@ -2667,7 +2710,7 @@ static void sheaf_flush_unused(struct kmem_cache *s, struct slab_sheaf *sheaf)
 static void pcs_flush_all(struct kmem_cache *s)
 {
 	struct slub_percpu_sheaves *pcs;
-	struct slab_sheaf *spare;
+	struct slab_sheaf *spare, *rcu_free;
 
 	local_lock(&s->cpu_sheaves->lock);
 	pcs = this_cpu_ptr(s->cpu_sheaves);
@@ -2675,6 +2718,9 @@ static void pcs_flush_all(struct kmem_cache *s)
 	spare = pcs->spare;
 	pcs->spare = NULL;
 
+	rcu_free = pcs->rcu_free;
+	pcs->rcu_free = NULL;
+
 	local_unlock(&s->cpu_sheaves->lock);
 
 	if (spare) {
@@ -2682,6 +2728,9 @@ static void pcs_flush_all(struct kmem_cache *s)
 		free_empty_sheaf(s, spare);
 	}
 
+	if (rcu_free)
+		call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn);
+
 	sheaf_flush_main(s);
 }
 
@@ -2698,6 +2747,11 @@ static void __pcs_flush_all_cpu(struct kmem_cache *s, unsigned int cpu)
 		free_empty_sheaf(s, pcs->spare);
 		pcs->spare = NULL;
 	}
+
+	if (pcs->rcu_free) {
+		call_rcu(&pcs->rcu_free->rcu_head, rcu_free_sheaf_nobarn);
+		pcs->rcu_free = NULL;
+	}
 }
 
 static void pcs_destroy(struct kmem_cache *s)
@@ -2723,6 +2777,7 @@ static void pcs_destroy(struct kmem_cache *s)
 		 */
 
 		WARN_ON(pcs->spare);
+		WARN_ON(pcs->rcu_free);
 
 		if (!WARN_ON(pcs->main->size)) {
 			free_empty_sheaf(s, pcs->main);
@@ -3780,7 +3835,7 @@ static bool has_pcs_used(int cpu, struct kmem_cache *s)
 
 	pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
 
-	return (pcs->spare || pcs->main->size);
+	return (pcs->spare || pcs->rcu_free || pcs->main->size);
 }
 
 /*
@@ -3840,6 +3895,80 @@ static void flush_all(struct kmem_cache *s)
 	cpus_read_unlock();
 }
 
+static void flush_rcu_sheaf(struct work_struct *w)
+{
+	struct slub_percpu_sheaves *pcs;
+	struct slab_sheaf *rcu_free;
+	struct slub_flush_work *sfw;
+	struct kmem_cache *s;
+
+	sfw = container_of(w, struct slub_flush_work, work);
+	s = sfw->s;
+
+	local_lock(&s->cpu_sheaves->lock);
+	pcs = this_cpu_ptr(s->cpu_sheaves);
+
+	rcu_free = pcs->rcu_free;
+	pcs->rcu_free = NULL;
+
+	local_unlock(&s->cpu_sheaves->lock);
+
+	if (rcu_free)
+		call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn);
+}
+
+
+/* needed for kvfree_rcu_barrier() */
+void flush_all_rcu_sheaves(void)
+{
+	struct slub_percpu_sheaves *pcs;
+	struct slub_flush_work *sfw;
+	struct kmem_cache *s;
+	bool flushed = false;
+	unsigned int cpu;
+
+	cpus_read_lock();
+	mutex_lock(&slab_mutex);
+
+	list_for_each_entry(s, &slab_caches, list) {
+		if (!s->cpu_sheaves)
+			continue;
+
+		mutex_lock(&flush_lock);
+
+		for_each_online_cpu(cpu) {
+			sfw = &per_cpu(slub_flush, cpu);
+			pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
+
+			if (!pcs->rcu_free || !pcs->rcu_free->size) {
+				sfw->skip = true;
+				continue;
+			}
+
+			INIT_WORK(&sfw->work, flush_rcu_sheaf);
+			sfw->skip = false;
+			sfw->s = s;
+			queue_work_on(cpu, flushwq, &sfw->work);
+			flushed = true;
+		}
+
+		for_each_online_cpu(cpu) {
+			sfw = &per_cpu(slub_flush, cpu);
+			if (sfw->skip)
+				continue;
+			flush_work(&sfw->work);
+		}
+
+		mutex_unlock(&flush_lock);
+	}
+
+	mutex_unlock(&slab_mutex);
+	cpus_read_unlock();
+
+	if (flushed)
+		rcu_barrier();
+}
+
 /*
  * Use the cpu notifier to insure that the cpu slabs are flushed when
  * necessary.
@@ -5413,6 +5542,130 @@ bool free_to_pcs(struct kmem_cache *s, void *object)
 	return true;
 }
 
+static void rcu_free_sheaf(struct rcu_head *head)
+{
+	struct slab_sheaf *sheaf;
+	struct node_barn *barn;
+	struct kmem_cache *s;
+
+	sheaf = container_of(head, struct slab_sheaf, rcu_head);
+
+	s = sheaf->cache;
+
+	/*
+	 * This may remove some objects due to slab_free_hook() returning false,
+	 * so that the sheaf might no longer be completely full. But it's easier
+	 * to handle it as full (unless it became completely empty), as the code
+	 * handles it fine. The only downside is that sheaf will serve fewer
+	 * allocations when reused. It only happens due to debugging, which is a
+	 * performance hit anyway.
+	 */
+	__rcu_free_sheaf_prepare(s, sheaf);
+
+	barn = get_node(s, numa_mem_id())->barn;
+
+	/* due to slab_free_hook() */
+	if (unlikely(sheaf->size == 0))
+		goto empty;
+
+	/*
+	 * Checking nr_full/nr_empty outside lock avoids contention in case the
+	 * barn is at the respective limit. Due to the race we might go over the
+	 * limit but that should be rare and harmless.
+	 */
+
+	if (data_race(barn->nr_full) < MAX_FULL_SHEAVES) {
+		stat(s, BARN_PUT);
+		barn_put_full_sheaf(barn, sheaf);
+		return;
+	}
+
+	stat(s, BARN_PUT_FAIL);
+	sheaf_flush_unused(s, sheaf);
+
+empty:
+	if (data_race(barn->nr_empty) < MAX_EMPTY_SHEAVES) {
+		barn_put_empty_sheaf(barn, sheaf);
+		return;
+	}
+
+	free_empty_sheaf(s, sheaf);
+}
+
+bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj)
+{
+	struct slub_percpu_sheaves *pcs;
+	struct slab_sheaf *rcu_sheaf;
+
+	if (!local_trylock(&s->cpu_sheaves->lock))
+		goto fail;
+
+	pcs = this_cpu_ptr(s->cpu_sheaves);
+
+	if (unlikely(!pcs->rcu_free)) {
+
+		struct slab_sheaf *empty;
+		struct node_barn *barn;
+
+		if (pcs->spare && pcs->spare->size == 0) {
+			pcs->rcu_free = pcs->spare;
+			pcs->spare = NULL;
+			goto do_free;
+		}
+
+		barn = get_barn(s);
+
+		empty = barn_get_empty_sheaf(barn);
+
+		if (empty) {
+			pcs->rcu_free = empty;
+			goto do_free;
+		}
+
+		local_unlock(&s->cpu_sheaves->lock);
+
+		empty = alloc_empty_sheaf(s, GFP_NOWAIT);
+
+		if (!empty)
+			goto fail;
+
+		if (!local_trylock(&s->cpu_sheaves->lock)) {
+			barn_put_empty_sheaf(barn, empty);
+			goto fail;
+		}
+
+		pcs = this_cpu_ptr(s->cpu_sheaves);
+
+		if (unlikely(pcs->rcu_free))
+			barn_put_empty_sheaf(barn, empty);
+		else
+			pcs->rcu_free = empty;
+	}
+
+do_free:
+
+	rcu_sheaf = pcs->rcu_free;
+
+	rcu_sheaf->objects[rcu_sheaf->size++] = obj;
+
+	if (likely(rcu_sheaf->size < s->sheaf_capacity))
+		rcu_sheaf = NULL;
+	else
+		pcs->rcu_free = NULL;
+
+	local_unlock(&s->cpu_sheaves->lock);
+
+	if (rcu_sheaf)
+		call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf);
+
+	stat(s, FREE_RCU_SHEAF);
+	return true;
+
+fail:
+	stat(s, FREE_RCU_SHEAF_FAIL);
+	return false;
+}
+
 /*
  * Bulk free objects to the percpu sheaves.
  * Unlike free_to_pcs() this includes the calls to all necessary hooks
@@ -6909,6 +7162,11 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
 	struct kmem_cache_node *n;
 
 	flush_all_cpus_locked(s);
+
+	/* we might have rcu sheaves in flight */
+	if (s->cpu_sheaves)
+		rcu_barrier();
+
 	/* Attempt to free all objects */
 	for_each_kmem_cache_node(s, node, n) {
 		if (n->barn)
@@ -8284,6 +8542,8 @@ STAT_ATTR(ALLOC_PCS, alloc_cpu_sheaf);
 STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
 STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
 STAT_ATTR(FREE_PCS, free_cpu_sheaf);
+STAT_ATTR(FREE_RCU_SHEAF, free_rcu_sheaf);
+STAT_ATTR(FREE_RCU_SHEAF_FAIL, free_rcu_sheaf_fail);
 STAT_ATTR(FREE_FASTPATH, free_fastpath);
 STAT_ATTR(FREE_SLOWPATH, free_slowpath);
 STAT_ATTR(FREE_FROZEN, free_frozen);
@@ -8382,6 +8642,8 @@ static struct attribute *slab_attrs[] = {
 	&alloc_fastpath_attr.attr,
 	&alloc_slowpath_attr.attr,
 	&free_cpu_sheaf_attr.attr,
+	&free_rcu_sheaf_attr.attr,
+	&free_rcu_sheaf_fail_attr.attr,
 	&free_fastpath_attr.attr,
 	&free_slowpath_attr.attr,
 	&free_frozen_attr.attr,

-- 
2.51.0
Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations
Posted by Daniel Gomez 3 months, 1 week ago

On 10/09/2025 10.01, Vlastimil Babka wrote:
> Extend the sheaf infrastructure for more efficient kfree_rcu() handling.
> ...
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>

Hi Vlastimil,

This patch increases kmod selftest (stress module loader) runtime by about
~50-60%, from ~200s to ~300s total execution time. My tested kernel has
CONFIG_KVFREE_RCU_BATCHED enabled. Any idea or suggestions on what might be
causing this, or how to address it?
Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations
Posted by Jon Hunter 2 months, 2 weeks ago

On 31/10/2025 21:32, Daniel Gomez wrote:
> 
> 
> On 10/09/2025 10.01, Vlastimil Babka wrote:
>> Extend the sheaf infrastructure for more efficient kfree_rcu() handling.
>> ...
>> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> 
> Hi Vlastimil,
> 
> This patch increases kmod selftest (stress module loader) runtime by about
> ~50-60%, from ~200s to ~300s total execution time. My tested kernel has
> CONFIG_KVFREE_RCU_BATCHED enabled. Any idea or suggestions on what might be
> causing this, or how to address it?
> 

I have been looking into a regression for Linux v6.18-rc where time 
taken to run some internal graphics tests on our Tegra234 device has 
increased from around 35% causing the tests to timeout. Bisect is 
pointing to this commit and I also see we have CONFIG_KVFREE_RCU_BATCHED=y.

I have not tried disabling CONFIG_KVFREE_RCU_BATCHED=y but I can. I am 
not sure if there are any downsides to disabling this?

Thanks
Jon

-- 
nvpublic
Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations
Posted by Vlastimil Babka 2 months, 2 weeks ago
On 11/27/25 12:38, Jon Hunter wrote:
> 
> 
> On 31/10/2025 21:32, Daniel Gomez wrote:
>> 
>> 
>> On 10/09/2025 10.01, Vlastimil Babka wrote:
>> 
>> Hi Vlastimil,
>> 
>> This patch increases kmod selftest (stress module loader) runtime by about
>> ~50-60%, from ~200s to ~300s total execution time. My tested kernel has
>> CONFIG_KVFREE_RCU_BATCHED enabled. Any idea or suggestions on what might be
>> causing this, or how to address it?
>> 
> 
> I have been looking into a regression for Linux v6.18-rc where time 
> taken to run some internal graphics tests on our Tegra234 device has 
> increased from around 35% causing the tests to timeout. Bisect is 
> pointing to this commit and I also see we have CONFIG_KVFREE_RCU_BATCHED=y.

Do the tegra tests involve (frequent) module unloads too, then? Or calling
kmem_cache_destroy() somewhere?

Thanks,
Vlastimil

> I have not tried disabling CONFIG_KVFREE_RCU_BATCHED=y but I can. I am 
> not sure if there are any downsides to disabling this?
> 
> Thanks
> Jon
>
Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations
Posted by Jon Hunter 2 months, 1 week ago
On 27/11/2025 13:18, Vlastimil Babka wrote:
> On 11/27/25 12:38, Jon Hunter wrote:
>>
>>
>> On 31/10/2025 21:32, Daniel Gomez wrote:
>>>
>>>
>>> On 10/09/2025 10.01, Vlastimil Babka wrote:
>>>
>>> Hi Vlastimil,
>>>
>>> This patch increases kmod selftest (stress module loader) runtime by about
>>> ~50-60%, from ~200s to ~300s total execution time. My tested kernel has
>>> CONFIG_KVFREE_RCU_BATCHED enabled. Any idea or suggestions on what might be
>>> causing this, or how to address it?
>>>
>>
>> I have been looking into a regression for Linux v6.18-rc where time
>> taken to run some internal graphics tests on our Tegra234 device has
>> increased from around 35% causing the tests to timeout. Bisect is
>> pointing to this commit and I also see we have CONFIG_KVFREE_RCU_BATCHED=y.
> 
> Do the tegra tests involve (frequent) module unloads too, then? Or calling
> kmem_cache_destroy() somewhere?

In this specific case I am not running the tegra-tests but we have an 
internal testsuite of GPU-related tests. I don't believe this is 
unloading any modules. I can take a look next week to see if 
kmem_cache_destroy() is getting called somewhere when these tests run.

Thanks
Jon

-- 
nvpublic
Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations
Posted by Harry Yoo 2 months, 2 weeks ago
On Thu, Nov 27, 2025 at 11:38:49AM +0000, Jon Hunter wrote:
> 
> 
> On 31/10/2025 21:32, Daniel Gomez wrote:
> > 
> > 
> > On 10/09/2025 10.01, Vlastimil Babka wrote:
> > > Extend the sheaf infrastructure for more efficient kfree_rcu() handling.
> > > ...
> > > Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> > 
> > Hi Vlastimil,
> > 
> > This patch increases kmod selftest (stress module loader) runtime by about
> > ~50-60%, from ~200s to ~300s total execution time. My tested kernel has
> > CONFIG_KVFREE_RCU_BATCHED enabled. Any idea or suggestions on what might be
> > causing this, or how to address it?
> > 
> 
> I have been looking into a regression for Linux v6.18-rc where time taken to
> run some internal graphics tests on our Tegra234 device has increased from
> around 35% causing the tests to timeout. Bisect is pointing to this commit
> and I also see we have CONFIG_KVFREE_RCU_BATCHED=y.

Thanks for reporting! Uh, this has been put aside while I was busy working
on other stuff... but now that we have two people complaining about this,
I'll allocate some time to investigate and improve it.

It'll take some time though :)

> I have not tried disabling CONFIG_KVFREE_RCU_BATCHED=y but I can. I am not
> sure if there are any downsides to disabling this?

I would not recommend doing that, unless you want to sacrifice overall
performance just for the test. Disabling it could create too many RCU
grace periods in the system.

> 
> Thanks
> Jon
> 
> -- 
> nvpublic

-- 
Cheers,
Harry / Hyeonggon
Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations
Posted by Harry Yoo 2 months, 2 weeks ago
On Thu, Nov 27, 2025 at 09:33:46PM +0900, Harry Yoo wrote:
> On Thu, Nov 27, 2025 at 11:38:49AM +0000, Jon Hunter wrote:
> > 
> > 
> > On 31/10/2025 21:32, Daniel Gomez wrote:
> > > 
> > > 
> > > On 10/09/2025 10.01, Vlastimil Babka wrote:
> > > > Extend the sheaf infrastructure for more efficient kfree_rcu() handling.
> > > > ...
> > > > Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> > > 
> > > Hi Vlastimil,
> > > 
> > > This patch increases kmod selftest (stress module loader) runtime by about
> > > ~50-60%, from ~200s to ~300s total execution time. My tested kernel has
> > > CONFIG_KVFREE_RCU_BATCHED enabled. Any idea or suggestions on what might be
> > > causing this, or how to address it?
> > > 
> > 
> > I have been looking into a regression for Linux v6.18-rc where time taken to
> > run some internal graphics tests on our Tegra234 device has increased from
> > around 35% causing the tests to timeout. Bisect is pointing to this commit
> > and I also see we have CONFIG_KVFREE_RCU_BATCHED=y.
> 
> Thanks for reporting! Uh, this has been put aside while I was busy working
> on other stuff... but now that we have two people complaining about this,
> I'll allocate some time to investigate and improve it.
> 
> It'll take some time though :)

By the way, how many CPUs do you have on your system, and does your
kernel have CONFIG_CODE_TAGGING enabled?

-- 
Cheers,
Harry / Hyeonggon
Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations
Posted by Jon Hunter 2 months, 1 week ago
On 27/11/2025 12:48, Harry Yoo wrote:

...

>>> I have been looking into a regression for Linux v6.18-rc where time taken to
>>> run some internal graphics tests on our Tegra234 device has increased from
>>> around 35% causing the tests to timeout. Bisect is pointing to this commit
>>> and I also see we have CONFIG_KVFREE_RCU_BATCHED=y.
>>
>> Thanks for reporting! Uh, this has been put aside while I was busy working
>> on other stuff... but now that we have two people complaining about this,
>> I'll allocate some time to investigate and improve it.
>>
>> It'll take some time though :)
> 
> By the way, how many CPUs do you have on your system, and does your
> kernel have CONFIG_CODE_TAGGING enabled?

For this device there are 12 CPUs. I don't see CONFIG_CODE_TAGGING enabled.

Thanks
Jon

-- 
nvpublic
Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations
Posted by Harry Yoo 2 months, 1 week ago
On Fri, Nov 28, 2025 at 08:57:28AM +0000, Jon Hunter wrote:
> 
> On 27/11/2025 12:48, Harry Yoo wrote:
> 
> ...
> 
> > > > I have been looking into a regression for Linux v6.18-rc where time taken to
> > > > run some internal graphics tests on our Tegra234 device has increased from
> > > > around 35% causing the tests to timeout. Bisect is pointing to this commit
> > > > and I also see we have CONFIG_KVFREE_RCU_BATCHED=y.
> > > 
> > > Thanks for reporting! Uh, this has been put aside while I was busy working
> > > on other stuff... but now that we have two people complaining about this,
> > > I'll allocate some time to investigate and improve it.
> > > 
> > > It'll take some time though :)
> > 
> > By the way, how many CPUs do you have on your system, and does your
> > kernel have CONFIG_CODE_TAGGING enabled?
> 
> For this device there are 12 CPUs. I don't see CONFIG_CODE_TAGGING enabled.

Thanks! Then it's probably due to kmem_cache_destroy().
Please let me know if this patch improves your test execution time.

https://lore.kernel.org/linux-mm/20251128113740.90129-1-harry.yoo@oracle.com/

-- 
Cheers,
Harry / Hyeonggon
Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations
Posted by Jon Hunter 2 months, 2 weeks ago
On 27/11/2025 11:38, Jon Hunter wrote:
> 
> 
> On 31/10/2025 21:32, Daniel Gomez wrote:
>>
>>
>> On 10/09/2025 10.01, Vlastimil Babka wrote:
>>> Extend the sheaf infrastructure for more efficient kfree_rcu() handling.
>>> ...
>>> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
>>
>> Hi Vlastimil,
>>
>> This patch increases kmod selftest (stress module loader) runtime by 
>> about
>> ~50-60%, from ~200s to ~300s total execution time. My tested kernel has
>> CONFIG_KVFREE_RCU_BATCHED enabled. Any idea or suggestions on what 
>> might be
>> causing this, or how to address it?
>>
> 
> I have been looking into a regression for Linux v6.18-rc where time 
> taken to run some internal graphics tests on our Tegra234 device has 
> increased from around 35% causing the tests to timeout. Bisect is 

I meant 'increased by around 35%'.

> pointing to this commit and I also see we have CONFIG_KVFREE_RCU_BATCHED=y.
> 
> I have not tried disabling CONFIG_KVFREE_RCU_BATCHED=y but I can. I am 
> not sure if there are any downsides to disabling this?
> 
> Thanks
> Jon
> 

-- 
nvpublic

Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations
Posted by Harry Yoo 3 months, 1 week ago
On Fri, Oct 31, 2025 at 10:32:54PM +0100, Daniel Gomez wrote:
> 
> 
> On 10/09/2025 10.01, Vlastimil Babka wrote:
> > Extend the sheaf infrastructure for more efficient kfree_rcu() handling.
> > ...
> > Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> 
> Hi Vlastimil,
> 
> This patch increases kmod selftest (stress module loader) runtime by about
> ~50-60%, from ~200s to ~300s total execution time. My tested kernel has
> CONFIG_KVFREE_RCU_BATCHED enabled. Any idea or suggestions on what might be
> causing this, or how to address it?

This is likely due to increased kvfree_rcu_barrier() during module unload.

It currently iterates over all CPUs x slab caches (that enabled sheaves,
there should be only a few now) pair to make sure rcu sheaf is flushed
by the time kvfree_rcu_barrier() returns.

Just being curious, do you have any serious workload that depends on
the performance of module unload?

-- 
Cheers,
Harry / Hyeonggon
Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations
Posted by Vlastimil Babka 3 months ago
On 11/3/25 04:17, Harry Yoo wrote:
> On Fri, Oct 31, 2025 at 10:32:54PM +0100, Daniel Gomez wrote:
>> 
>> 
>> On 10/09/2025 10.01, Vlastimil Babka wrote:
>> > Extend the sheaf infrastructure for more efficient kfree_rcu() handling.
>> > ...
>> > Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
>> 
>> Hi Vlastimil,
>> 
>> This patch increases kmod selftest (stress module loader) runtime by about
>> ~50-60%, from ~200s to ~300s total execution time. My tested kernel has
>> CONFIG_KVFREE_RCU_BATCHED enabled. Any idea or suggestions on what might be
>> causing this, or how to address it?
> 
> This is likely due to increased kvfree_rcu_barrier() during module unload.

Hm so there are actually two possible sources of this. One is that the
module creates some kmem_cache and calls kmem_cache_destroy() on it before
unloading. That does kvfree_rcu_barrier() which iterates all caches via
flush_all_rcu_sheaves(), but in this case it shouldn't need to - we could
have a weaker form of kvfree_rcu_barrier() that only guarantees flushing of
that single cache.

The other source is codetag_unload_module(), and I'm afraid it's this one as
it's hooked to every module unload. Do you have CONFIG_CODE_TAGGING enabled?
Disabling it should help in this case, if you don't need memory allocation
profiling for that stress test. I think there's some space for improvement -
when compiled in but memalloc profiling never enabled during the uptime,
this could probably be skipped? Suren?

> It currently iterates over all CPUs x slab caches (that enabled sheaves,
> there should be only a few now) pair to make sure rcu sheaf is flushed
> by the time kvfree_rcu_barrier() returns.

Yeah, also it's done under slab_mutex. Is the stress test trying to unload
multiple modules in parallel? That would make things worse, although I'd
expect there's a lot of serialization in this area already.

Unfortunately it will get worse with sheaves extended to all caches. We
could probably mark caches once they allocate their first rcu_free sheaf
(should not add visible overhead) and keep skipping those that never did.
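
Something along these lines (an illustrative sketch only - the flag name
is invented and this is not a tested patch): set the flag, e.g. with
WRITE_ONCE(s->ever_had_rcu_sheaf, true), wherever __kfree_rcu_sheaf()
installs pcs->rcu_free, and then skip unmarked caches in
flush_all_rcu_sheaves():

	list_for_each_entry(s, &slab_caches, list) {
		/* skip caches that never used an rcu_free sheaf */
		if (!s->cpu_sheaves || !READ_ONCE(s->ever_had_rcu_sheaf))
			continue;
		/* ... flush the per-cpu rcu_free sheaves as today ... */
	}
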
> Just being curious, do you have any serious workload that depends on
> the performance of module unload?
Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations
Posted by Daniel Gomez 2 months, 2 weeks ago

On 05/11/2025 12.25, Vlastimil Babka wrote:
> On 11/3/25 04:17, Harry Yoo wrote:
>> On Fri, Oct 31, 2025 at 10:32:54PM +0100, Daniel Gomez wrote:
>>>
>>>
>>> On 10/09/2025 10.01, Vlastimil Babka wrote:
>>>> Extend the sheaf infrastructure for more efficient kfree_rcu() handling.
>>>> ...
>>>> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
>>>
>>> Hi Vlastimil,
>>>
>>> This patch increases kmod selftest (stress module loader) runtime by about
>>> ~50-60%, from ~200s to ~300s total execution time. My tested kernel has
>>> CONFIG_KVFREE_RCU_BATCHED enabled. Any idea or suggestions on what might be
>>> causing this, or how to address it?
>>
>> This is likely due to increased kvfree_rcu_barrier() during module unload.
> 
> Hm so there are actually two possible sources of this. One is that the
> module creates some kmem_cache and calls kmem_cache_destroy() on it before
> unloading. That does kvfree_rcu_barrier() which iterates all caches via
> flush_all_rcu_sheaves(), but in this case it shouldn't need to - we could
> have a weaker form of kvfree_rcu_barrier() that only guarantees flushing of
> that single cache.

Thanks for the feedback. And thanks to Jon who has revived this again.

> 
> The other source is codetag_unload_module(), and I'm afraid it's this one as
> it's hooked to every module unload. Do you have CONFIG_CODE_TAGGING enabled?

Yes, we do have that enabled.

> Disabling it should help in this case, if you don't need memory allocation
> profiling for that stress test. I think there's some space for improvement -
> when compiled in but memalloc profiling never enabled during the uptime,
> this could probably be skipped? Suren?
> 
>> It currently iterates over all CPUs x slab caches (that enabled sheaves,
>> there should be only a few now) pair to make sure rcu sheaf is flushed
>> by the time kvfree_rcu_barrier() returns.
> 
> Yeah, also it's done under slab_mutex. Is the stress test trying to unload
> multiple modules in parallel? That would make things worse, although I'd
> expect there's a lot of serialization in this area already.

AFAIK, the kmod stress test does not unload modules in parallel. Module unload
happens one at a time before each test iteration. However, test 0008 and 0009
run 300 total sequential module unloads.

ALL_TESTS="$ALL_TESTS 0008:150:1"
ALL_TESTS="$ALL_TESTS 0009:150:1"

> 
> Unfortunately it will get worse with sheaves extended to all caches. We
> could probably mark caches once they allocate their first rcu_free sheaf
> (should not add visible overhead) and keep skipping those that never did.
>> Just being curious, do you have any serious workload that depends on
>> the performance of module unload?

Can we have a combination of a weaker form of kvfree_rcu_barrier() + tracking?
Happy to test this again if you have a patch or something in mind.

In addition and AFAIK, module unloading is similar to ebpf programs. Ccing bpf
folks in case they have a workload.

But I don't have a particular workload in mind.
Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations
Posted by Suren Baghdasaryan 2 months, 1 week ago
On Thu, Nov 27, 2025 at 6:01 AM Daniel Gomez <da.gomez@kernel.org> wrote:
>
>
>
> On 05/11/2025 12.25, Vlastimil Babka wrote:
> > On 11/3/25 04:17, Harry Yoo wrote:
> >> On Fri, Oct 31, 2025 at 10:32:54PM +0100, Daniel Gomez wrote:
> >>>
> >>>
> >>> On 10/09/2025 10.01, Vlastimil Babka wrote:
> >>>> Extend the sheaf infrastructure for more efficient kfree_rcu() handling.
> >>>> ...
> >>>> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> >>>
> >>> Hi Vlastimil,
> >>>
> >>> This patch increases kmod selftest (stress module loader) runtime by about
> >>> ~50-60%, from ~200s to ~300s total execution time. My tested kernel has
> >>> CONFIG_KVFREE_RCU_BATCHED enabled. Any idea or suggestions on what might be
> >>> causing this, or how to address it?
> >>
> >> This is likely due to increased kvfree_rcu_barrier() during module unload.
> >
> > Hm so there are actually two possible sources of this. One is that the
> > module creates some kmem_cache and calls kmem_cache_destroy() on it before
> > unloading. That does kvfree_rcu_barrier() which iterates all caches via
> > flush_all_rcu_sheaves(), but in this case it shouldn't need to - we could
> > have a weaker form of kvfree_rcu_barrier() that only guarantees flushing of
> > that single cache.
>
> Thanks for the feedback. And thanks to Jon who has revived this again.
>
> >
> > The other source is codetag_unload_module(), and I'm afraid it's this one as
> > it's hooked to every module unload. Do you have CONFIG_CODE_TAGGING enabled?
>
> Yes, we do have that enabled.

Sorry I missed this discussion before.
IIUC, the performance is impacted because kvfree_rcu_barrier() has to
flush_all_rcu_sheaves(), therefore is more costly than before.

>
> > Disabling it should help in this case, if you don't need memory allocation
> > profiling for that stress test. I think there's some space for improvement -
> > when compiled in but memalloc profiling never enabled during the uptime,
> > this could probably be skipped? Suren?

I think yes, we should be able to skip kvfree_rcu_barrier() inside
codetag_unload_module() if profiling was not enabled.
kvfree_rcu_barrier() is there to ensure all potential kfree_rcu()'s
for module allocations are finished before destroying the tags. I'll
need to add an additional "sticky" flag to record that profiling was
used so that we detect a case when it was enabled, then disabled
before module unloading. I can work on it next week.
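
Roughly something like this (hypothetical names, just to illustrate the
idea - not the actual alloc_tag/codetag code):

	/* "sticky": set when profiling is first enabled, never cleared */
	static bool mem_profiling_ever_enabled;

	/* on module unload, before destroying the module's tags: */
	if (READ_ONCE(mem_profiling_ever_enabled))
		kvfree_rcu_barrier();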

> >
> >> It currently iterates over all CPUs x slab caches (that enabled sheaves,
> >> there should be only a few now) pair to make sure rcu sheaf is flushed
> >> by the time kvfree_rcu_barrier() returns.
> >
> > Yeah, also it's done under slab_mutex. Is the stress test trying to unload
> > multiple modules in parallel? That would make things worse, although I'd
> > expect there's a lot of serialization in this area already.
>
> AFAIK, the kmod stress test does not unload modules in parallel. Module unload
> happens one at a time before each test iteration. However, test 0008 and 0009
> run 300 total sequential module unloads.
>
> ALL_TESTS="$ALL_TESTS 0008:150:1"
> ALL_TESTS="$ALL_TESTS 0009:150:1"
>
> >
> > Unfortunately it will get worse with sheaves extended to all caches. We
> > could probably mark caches once they allocate their first rcu_free sheaf
> > (should not add visible overhead) and keep skipping those that never did.
> >> Just being curious, do you have any serious workload that depends on
> >> the performance of module unload?
>
> Can we have a combination of a weaker form of kvfree_rcu_barrier() + tracking?
> Happy to test this again if you have a patch or something in mind.
>
> In addition and AFAIK, module unloading is similar to ebpf programs. Ccing bpf
> folks in case they have a workload.
>
> But I don't have a particular workload in mind.
[PATCH V1] mm/slab: introduce kvfree_rcu_barrier_on_cache() for cache destruction
Posted by Harry Yoo 2 months, 1 week ago
Currently, kvfree_rcu_barrier() flushes RCU sheaves across all slab
caches when a cache is destroyed. This is unnecessary when destroying
a slab cache; only the RCU sheaves belonging to the cache being destroyed
need to be flushed.

As suggested by Vlastimil Babka, introduce a weaker form of
kvfree_rcu_barrier() that operates on a specific slab cache and call it
on cache destruction.

The performance benefit was evaluated on a 12-core / 24-thread AMD Ryzen
5900X machine (1 socket) by loading the slub_kunit module.

Before:
  Total calls: 19
  Average latency (us): 8529
  Total time (us): 162069

After:
  Total calls: 19
  Average latency (us): 3804
  Total time (us): 72287

Link: https://lore.kernel.org/linux-mm/0406562e-2066-4cf8-9902-b2b0616dd742@kernel.org
Link: https://lore.kernel.org/linux-mm/e988eff6-1287-425e-a06c-805af5bbf262@nvidia.com
Link: https://lore.kernel.org/linux-mm/1bda09da-93be-4737-aef0-d47f8c5c9301@suse.cz
Suggested-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Harry Yoo <harry.yoo@oracle.com>
---

Not sure if the regression is worse on the reporters' machines due to
higher core count (or because some cores were busy doing other things,
dunno).

Hopefully this will reduce the time to complete tests,
and Suren could add his patch on top of this ;)

 include/linux/slab.h |  5 ++++
 mm/slab.h            |  1 +
 mm/slab_common.c     | 52 +++++++++++++++++++++++++++++------------
 mm/slub.c            | 55 ++++++++++++++++++++++++--------------------
 4 files changed, 73 insertions(+), 40 deletions(-)

diff --git a/include/linux/slab.h b/include/linux/slab.h
index cf443f064a66..937c93d44e8c 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -1149,6 +1149,10 @@ static inline void kvfree_rcu_barrier(void)
 {
 	rcu_barrier();
 }
+static inline void kvfree_rcu_barrier_on_cache(struct kmem_cache *s)
+{
+	rcu_barrier();
+}
 
 static inline void kfree_rcu_scheduler_running(void) { }
 #else
@@ -1156,6 +1160,7 @@ void kvfree_rcu_barrier(void);
 
 void kfree_rcu_scheduler_running(void);
 #endif
+void kvfree_rcu_barrier_on_cache(struct kmem_cache *s);
 
 /**
  * kmalloc_size_roundup - Report allocation bucket size for the given size
diff --git a/mm/slab.h b/mm/slab.h
index f730e012553c..e767aa7e91b0 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -422,6 +422,7 @@ static inline bool is_kmalloc_normal(struct kmem_cache *s)
 
 bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj);
 void flush_all_rcu_sheaves(void);
+void flush_rcu_sheaves_on_cache(struct kmem_cache *s);
 
 #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
 			 SLAB_CACHE_DMA32 | SLAB_PANIC | \
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 84dfff4f7b1f..dd8a49d6f9cc 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -492,7 +492,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
 		return;
 
 	/* in-flight kfree_rcu()'s may include objects from our cache */
-	kvfree_rcu_barrier();
+	kvfree_rcu_barrier_on_cache(s);
 
 	if (IS_ENABLED(CONFIG_SLUB_RCU_DEBUG) &&
 	    (s->flags & SLAB_TYPESAFE_BY_RCU)) {
@@ -2038,25 +2038,13 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr)
 }
 EXPORT_SYMBOL_GPL(kvfree_call_rcu);
 
-/**
- * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete.
- *
- * Note that a single argument of kvfree_rcu() call has a slow path that
- * triggers synchronize_rcu() following by freeing a pointer. It is done
- * before the return from the function. Therefore for any single-argument
- * call that will result in a kfree() to a cache that is to be destroyed
- * during module exit, it is developer's responsibility to ensure that all
- * such calls have returned before the call to kmem_cache_destroy().
- */
-void kvfree_rcu_barrier(void)
+static inline void __kvfree_rcu_barrier(void)
 {
 	struct kfree_rcu_cpu_work *krwp;
 	struct kfree_rcu_cpu *krcp;
 	bool queued;
 	int i, cpu;
 
-	flush_all_rcu_sheaves();
-
 	/*
 	 * Firstly we detach objects and queue them over an RCU-batch
 	 * for all CPUs. Finally queued works are flushed for each CPU.
@@ -2118,8 +2106,43 @@ void kvfree_rcu_barrier(void)
 		}
 	}
 }
+
+/**
+ * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete.
+ *
+ * Note that a single argument of kvfree_rcu() call has a slow path that
+ * triggers synchronize_rcu() following by freeing a pointer. It is done
+ * before the return from the function. Therefore for any single-argument
+ * call that will result in a kfree() to a cache that is to be destroyed
+ * during module exit, it is developer's responsibility to ensure that all
+ * such calls have returned before the call to kmem_cache_destroy().
+ */
+void kvfree_rcu_barrier(void)
+{
+	flush_all_rcu_sheaves();
+	__kvfree_rcu_barrier();
+}
 EXPORT_SYMBOL_GPL(kvfree_rcu_barrier);
 
+/**
+ * kvfree_rcu_barrier_on_cache - Wait for in-flight kvfree_rcu() calls on a
+ *                               specific slab cache.
+ * @s: slab cache to wait for
+ *
+ * See the description of kvfree_rcu_barrier() for details.
+ */
+void kvfree_rcu_barrier_on_cache(struct kmem_cache *s)
+{
+	if (s->cpu_sheaves)
+		flush_rcu_sheaves_on_cache(s);
+	/*
+	 * TODO: Introduce a version of __kvfree_rcu_barrier() that works
+	 * on a specific slab cache.
+	 */
+	__kvfree_rcu_barrier();
+}
+EXPORT_SYMBOL_GPL(kvfree_rcu_barrier_on_cache);
+
 static unsigned long
 kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
 {
@@ -2215,4 +2238,3 @@ void __init kvfree_rcu_init(void)
 }
 
 #endif /* CONFIG_KVFREE_RCU_BATCHED */
-
diff --git a/mm/slub.c b/mm/slub.c
index 785e25a14999..7cec2220712b 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4118,42 +4118,47 @@ static void flush_rcu_sheaf(struct work_struct *w)
 
 
 /* needed for kvfree_rcu_barrier() */
-void flush_all_rcu_sheaves(void)
+void flush_rcu_sheaves_on_cache(struct kmem_cache *s)
 {
 	struct slub_flush_work *sfw;
-	struct kmem_cache *s;
 	unsigned int cpu;
 
-	cpus_read_lock();
-	mutex_lock(&slab_mutex);
+	mutex_lock(&flush_lock);
 
-	list_for_each_entry(s, &slab_caches, list) {
-		if (!s->cpu_sheaves)
-			continue;
+	for_each_online_cpu(cpu) {
+		sfw = &per_cpu(slub_flush, cpu);
 
-		mutex_lock(&flush_lock);
+		/*
+		 * we don't check if rcu_free sheaf exists - racing
+		 * __kfree_rcu_sheaf() might have just removed it.
+		 * by executing flush_rcu_sheaf() on the cpu we make
+		 * sure the __kfree_rcu_sheaf() finished its call_rcu()
+		 */
 
-		for_each_online_cpu(cpu) {
-			sfw = &per_cpu(slub_flush, cpu);
+		INIT_WORK(&sfw->work, flush_rcu_sheaf);
+		sfw->s = s;
+		queue_work_on(cpu, flushwq, &sfw->work);
+	}
 
-			/*
-			 * we don't check if rcu_free sheaf exists - racing
-			 * __kfree_rcu_sheaf() might have just removed it.
-			 * by executing flush_rcu_sheaf() on the cpu we make
-			 * sure the __kfree_rcu_sheaf() finished its call_rcu()
-			 */
+	for_each_online_cpu(cpu) {
+		sfw = &per_cpu(slub_flush, cpu);
+		flush_work(&sfw->work);
+	}
 
-			INIT_WORK(&sfw->work, flush_rcu_sheaf);
-			sfw->s = s;
-			queue_work_on(cpu, flushwq, &sfw->work);
-		}
+	mutex_unlock(&flush_lock);
+}
 
-		for_each_online_cpu(cpu) {
-			sfw = &per_cpu(slub_flush, cpu);
-			flush_work(&sfw->work);
-		}
+void flush_all_rcu_sheaves(void)
+{
+	struct kmem_cache *s;
+
+	cpus_read_lock();
+	mutex_lock(&slab_mutex);
 
-		mutex_unlock(&flush_lock);
+	list_for_each_entry(s, &slab_caches, list) {
+		if (!s->cpu_sheaves)
+			continue;
+		flush_rcu_sheaves_on_cache(s);
 	}
 
 	mutex_unlock(&slab_mutex);
-- 
2.43.0
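
For context, a minimal and purely illustrative module pattern that relies on
this barrier: objects queued with kfree_rcu() may still sit in the cache's
percpu rcu_free sheaves when the module tears down its cache, and it is the
kvfree_rcu_barrier_on_cache() call inside kmem_cache_destroy() that now waits
for them. The foo names below are made up:

	#include <linux/module.h>
	#include <linux/slab.h>

	struct foo {
		int val;
		struct rcu_head rcu;
	};

	static struct kmem_cache *foo_cache;

	static int __init foo_init(void)
	{
		foo_cache = kmem_cache_create("foo", sizeof(struct foo),
					      0, 0, NULL);
		return foo_cache ? 0 : -ENOMEM;
	}

	/* called from the module's normal object teardown paths (not shown) */
	static void foo_free(struct foo *f)
	{
		/* may end up batched in foo_cache's rcu_free sheaf */
		kfree_rcu(f, rcu);
	}

	static void __exit foo_exit(void)
	{
		/*
		 * With this patch, only foo_cache's pending rcu_free sheaves
		 * (plus the kvfree_rcu batches) are waited for here, not the
		 * sheaves of every sheaf-enabled cache in the system.
		 */
		kmem_cache_destroy(foo_cache);
	}

	module_init(foo_init);
	module_exit(foo_exit);
	MODULE_LICENSE("GPL");
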
Re: [PATCH V1] mm/slab: introduce kvfree_rcu_barrier_on_cache() for cache destruction
Posted by Jon Hunter 2 months, 1 week ago
On 28/11/2025 11:37, Harry Yoo wrote:
> Currently, kvfree_rcu_barrier() flushes RCU sheaves across all slab
> caches when a cache is destroyed. This is unnecessary when destroying
> a slab cache; only the RCU sheaves belonging to the cache being destroyed
> need to be flushed.
> 
> As suggested by Vlastimil Babka, introduce a weaker form of
> kvfree_rcu_barrier() that operates on a specific slab cache and call it
> on cache destruction.
> 
> The performance benefit is evaluated on a 12 core 24 threads AMD Ryzen
> 5900X machine (1 socket), by loading slub_kunit module.
> 
> Before:
>    Total calls: 19
>    Average latency (us): 8529
>    Total time (us): 162069
> 
> After:
>    Total calls: 19
>    Average latency (us): 3804
>    Total time (us): 72287
> 
> Link: https://lore.kernel.org/linux-mm/0406562e-2066-4cf8-9902-b2b0616dd742@kernel.org
> Link: https://lore.kernel.org/linux-mm/e988eff6-1287-425e-a06c-805af5bbf262@nvidia.com
> Link: https://lore.kernel.org/linux-mm/1bda09da-93be-4737-aef0-d47f8c5c9301@suse.cz
> Suggested-by: Vlastimil Babka <vbabka@suse.cz>
> Signed-off-by: Harry Yoo <harry.yoo@oracle.com>
> ---

Thanks for the rapid fix. I have been testing this and can confirm that 
this does fix the performance regression I was seeing.

BTW shouldn't we add a 'Fixes:' tag above? I would like to ensure that 
this gets picked up for v6.18 stable.

Otherwise ...

Tested-by: Jon Hunter <jonathanh@nvidia.com>

Thanks!
Jon

-- 
nvpublic
Re: [PATCH V1] mm/slab: introduce kvfree_rcu_barrier_on_cache() for cache destruction
Posted by Harry Yoo 2 months, 1 week ago
On Tue, Dec 02, 2025 at 09:29:17AM +0000, Jon Hunter wrote:
> 
> On 28/11/2025 11:37, Harry Yoo wrote:
> > Currently, kvfree_rcu_barrier() flushes RCU sheaves across all slab
> > caches when a cache is destroyed. This is unnecessary when destroying
> > a slab cache; only the RCU sheaves belonging to the cache being destroyed
> > need to be flushed.
> > 
> > As suggested by Vlastimil Babka, introduce a weaker form of
> > kvfree_rcu_barrier() that operates on a specific slab cache and call it
> > on cache destruction.
> > 
> > The performance benefit is evaluated on a 12 core 24 threads AMD Ryzen
> > 5900X machine (1 socket), by loading slub_kunit module.
> > 
> > Before:
> >    Total calls: 19
> >    Average latency (us): 8529
> >    Total time (us): 162069
> > 
> > After:
> >    Total calls: 19
> >    Average latency (us): 3804
> >    Total time (us): 72287
> > 
> > Link: https://lore.kernel.org/linux-mm/0406562e-2066-4cf8-9902-b2b0616dd742@kernel.org
> > Link: https://lore.kernel.org/linux-mm/e988eff6-1287-425e-a06c-805af5bbf262@nvidia.com
> > Link: https://lore.kernel.org/linux-mm/1bda09da-93be-4737-aef0-d47f8c5c9301@suse.cz
> > Suggested-by: Vlastimil Babka <vbabka@suse.cz>
> > Signed-off-by: Harry Yoo <harry.yoo@oracle.com>
> > ---
> 
> Thanks for the rapid fix. I have been testing this and can confirm that this
> does fix the performance regression I was seeing.

Great!

> BTW shouldn't we add a 'Fixes:' tag above? I would like to ensure that this
> gets picked up for v6.18 stable.

Good point, I added Cc: stable and Fixes: tags.
(and your and Daniel's Reported-and-tested-by: tags)

> Otherwise ...
> 
> Tested-by: Jon Hunter <jonathanh@nvidia.com>

Thanks a lot, Jon and Daniel, for reporting the regression and testing the fix!

-- 
Cheers,
Harry / Hyeonggon
Re: [PATCH V1] mm/slab: introduce kvfree_rcu_barrier_on_cache() for cache destruction
Posted by Daniel Gomez 2 months, 1 week ago

On 28/11/2025 12.37, Harry Yoo wrote:
> Currently, kvfree_rcu_barrier() flushes RCU sheaves across all slab
> caches when a cache is destroyed. This is unnecessary when destroying
> a slab cache; only the RCU sheaves belonging to the cache being destroyed
> need to be flushed.
> 
> As suggested by Vlastimil Babka, introduce a weaker form of
> kvfree_rcu_barrier() that operates on a specific slab cache and call it
> on cache destruction.
> 
> The performance benefit is evaluated on a 12 core 24 threads AMD Ryzen
> 5900X machine (1 socket), by loading slub_kunit module.
> 
> Before:
>   Total calls: 19
>   Average latency (us): 8529
>   Total time (us): 162069
> 
> After:
>   Total calls: 19
>   Average latency (us): 3804
>   Total time (us): 72287
> 
> Link: https://lore.kernel.org/linux-mm/0406562e-2066-4cf8-9902-b2b0616dd742@kernel.org
> Link: https://lore.kernel.org/linux-mm/e988eff6-1287-425e-a06c-805af5bbf262@nvidia.com
> Link: https://lore.kernel.org/linux-mm/1bda09da-93be-4737-aef0-d47f8c5c9301@suse.cz
> Suggested-by: Vlastimil Babka <vbabka@suse.cz>
> Signed-off-by: Harry Yoo <harry.yoo@oracle.com>
> ---

Thanks Harry for the patch,

A quick test on a different machine from the one I originally used to report
this shows a decrease from 214s to 100s.

LGTM,

Tested-by: Daniel Gomez <da.gomez@samsung.com>

> 
> Not sure if the regression is worse on the reporters' machines due to
> higher core count (or because some cores were busy doing other things,
> dunno).

FWIW, the CI module tests run on an 8-core VM. Depending on the host CPU, the
absolute numbers differ, but the performance degradation was equivalent.

> 
> Hopefully this will reduce the time to complete tests,
> and Suren could add his patch on top of this ;)
> 
>  include/linux/slab.h |  5 ++++
>  mm/slab.h            |  1 +
>  mm/slab_common.c     | 52 +++++++++++++++++++++++++++++------------
>  mm/slub.c            | 55 ++++++++++++++++++++++++--------------------
>  4 files changed, 73 insertions(+), 40 deletions(-)
Re: [PATCH V1] mm/slab: introduce kvfree_rcu_barrier_on_cache() for cache destruction
Posted by Harry Yoo 2 months, 1 week ago
On Fri, Nov 28, 2025 at 08:37:40PM +0900, Harry Yoo wrote:
> Currently, kvfree_rcu_barrier() flushes RCU sheaves across all slab
> caches when a cache is destroyed. This is unnecessary when destroying
> a slab cache; only the RCU sheaves belonging to the cache being destroyed
> need to be flushed.
> 
> As suggested by Vlastimil Babka, introduce a weaker form of
> kvfree_rcu_barrier() that operates on a specific slab cache and call it
> on cache destruction.
> 
> The performance benefit is evaluated on a 12 core 24 threads AMD Ryzen
> 5900X machine (1 socket), by loading slub_kunit module.
> 
> Before:
>   Total calls: 19
>   Average latency (us): 8529
>   Total time (us): 162069
> 
> After:
>   Total calls: 19
>   Average latency (us): 3804
>   Total time (us): 72287

Ooh, I just realized that I messed up the config and
have only two cores enabled. Will update the numbers after enabling 22 more :)

> Link: https://lore.kernel.org/linux-mm/0406562e-2066-4cf8-9902-b2b0616dd742@kernel.org
> Link: https://lore.kernel.org/linux-mm/e988eff6-1287-425e-a06c-805af5bbf262@nvidia.com
> Link: https://lore.kernel.org/linux-mm/1bda09da-93be-4737-aef0-d47f8c5c9301@suse.cz
> Suggested-by: Vlastimil Babka <vbabka@suse.cz>
> Signed-off-by: Harry Yoo <harry.yoo@oracle.com>
> ---
> 
> Not sure if the regression is worse on the reporters' machines due to
> higher core count (or because some cores were busy doing other things,
> dunno).
> 
> Hopefully this will reduce the time to complete tests,
> and Suren could add his patch on top of this ;)
> 
>  include/linux/slab.h |  5 ++++
>  mm/slab.h            |  1 +
>  mm/slab_common.c     | 52 +++++++++++++++++++++++++++++------------
>  mm/slub.c            | 55 ++++++++++++++++++++++++--------------------
>  4 files changed, 73 insertions(+), 40 deletions(-)
> 
> diff --git a/include/linux/slab.h b/include/linux/slab.h
> index cf443f064a66..937c93d44e8c 100644
> --- a/include/linux/slab.h
> +++ b/include/linux/slab.h
> @@ -1149,6 +1149,10 @@ static inline void kvfree_rcu_barrier(void)
>  {
>  	rcu_barrier();
>  }
> +static inline void kvfree_rcu_barrier_on_cache(struct kmem_cache *s)
> +{
> +	rcu_barrier();
> +}
>  
>  static inline void kfree_rcu_scheduler_running(void) { }
>  #else
> @@ -1156,6 +1160,7 @@ void kvfree_rcu_barrier(void);
>  
>  void kfree_rcu_scheduler_running(void);
>  #endif
> +void kvfree_rcu_barrier_on_cache(struct kmem_cache *s);
>  
>  /**
>   * kmalloc_size_roundup - Report allocation bucket size for the given size
> diff --git a/mm/slab.h b/mm/slab.h
> index f730e012553c..e767aa7e91b0 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -422,6 +422,7 @@ static inline bool is_kmalloc_normal(struct kmem_cache *s)
>  
>  bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj);
>  void flush_all_rcu_sheaves(void);
> +void flush_rcu_sheaves_on_cache(struct kmem_cache *s);
>  
>  #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
>  			 SLAB_CACHE_DMA32 | SLAB_PANIC | \
> diff --git a/mm/slab_common.c b/mm/slab_common.c
> index 84dfff4f7b1f..dd8a49d6f9cc 100644
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -492,7 +492,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
>  		return;
>  
>  	/* in-flight kfree_rcu()'s may include objects from our cache */
> -	kvfree_rcu_barrier();
> +	kvfree_rcu_barrier_on_cache(s);
>  
>  	if (IS_ENABLED(CONFIG_SLUB_RCU_DEBUG) &&
>  	    (s->flags & SLAB_TYPESAFE_BY_RCU)) {
> @@ -2038,25 +2038,13 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr)
>  }
>  EXPORT_SYMBOL_GPL(kvfree_call_rcu);
>  
> -/**
> - * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete.
> - *
> - * Note that a single argument of kvfree_rcu() call has a slow path that
> - * triggers synchronize_rcu() following by freeing a pointer. It is done
> - * before the return from the function. Therefore for any single-argument
> - * call that will result in a kfree() to a cache that is to be destroyed
> - * during module exit, it is developer's responsibility to ensure that all
> - * such calls have returned before the call to kmem_cache_destroy().
> - */
> -void kvfree_rcu_barrier(void)
> +static inline void __kvfree_rcu_barrier(void)
>  {
>  	struct kfree_rcu_cpu_work *krwp;
>  	struct kfree_rcu_cpu *krcp;
>  	bool queued;
>  	int i, cpu;
>  
> -	flush_all_rcu_sheaves();
> -
>  	/*
>  	 * Firstly we detach objects and queue them over an RCU-batch
>  	 * for all CPUs. Finally queued works are flushed for each CPU.
> @@ -2118,8 +2106,43 @@ void kvfree_rcu_barrier(void)
>  		}
>  	}
>  }
> +
> +/**
> + * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete.
> + *
> + * Note that a single argument of kvfree_rcu() call has a slow path that
> + * triggers synchronize_rcu() following by freeing a pointer. It is done
> + * before the return from the function. Therefore for any single-argument
> + * call that will result in a kfree() to a cache that is to be destroyed
> + * during module exit, it is developer's responsibility to ensure that all
> + * such calls have returned before the call to kmem_cache_destroy().
> + */
> +void kvfree_rcu_barrier(void)
> +{
> +	flush_all_rcu_sheaves();
> +	__kvfree_rcu_barrier();
> +}
>  EXPORT_SYMBOL_GPL(kvfree_rcu_barrier);
>  
> +/**
> + * kvfree_rcu_barrier_on_cache - Wait for in-flight kvfree_rcu() calls on a
> + *                               specific slab cache.
> + * @s: slab cache to wait for
> + *
> + * See the description of kvfree_rcu_barrier() for details.
> + */
> +void kvfree_rcu_barrier_on_cache(struct kmem_cache *s)
> +{
> +	if (s->cpu_sheaves)
> +		flush_rcu_sheaves_on_cache(s);
> +	/*
> +	 * TODO: Introduce a version of __kvfree_rcu_barrier() that works
> +	 * on a specific slab cache.
> +	 */
> +	__kvfree_rcu_barrier();
> +}
> +EXPORT_SYMBOL_GPL(kvfree_rcu_barrier_on_cache);
> +
>  static unsigned long
>  kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
>  {
> @@ -2215,4 +2238,3 @@ void __init kvfree_rcu_init(void)
>  }
>  
>  #endif /* CONFIG_KVFREE_RCU_BATCHED */
> -
> diff --git a/mm/slub.c b/mm/slub.c
> index 785e25a14999..7cec2220712b 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -4118,42 +4118,47 @@ static void flush_rcu_sheaf(struct work_struct *w)
>  
>  
>  /* needed for kvfree_rcu_barrier() */
> -void flush_all_rcu_sheaves(void)
> +void flush_rcu_sheaves_on_cache(struct kmem_cache *s)
>  {
>  	struct slub_flush_work *sfw;
> -	struct kmem_cache *s;
>  	unsigned int cpu;
>  
> -	cpus_read_lock();
> -	mutex_lock(&slab_mutex);
> +	mutex_lock(&flush_lock);
>  
> -	list_for_each_entry(s, &slab_caches, list) {
> -		if (!s->cpu_sheaves)
> -			continue;
> +	for_each_online_cpu(cpu) {
> +		sfw = &per_cpu(slub_flush, cpu);
>  
> -		mutex_lock(&flush_lock);
> +		/*
> +		 * we don't check if rcu_free sheaf exists - racing
> +		 * __kfree_rcu_sheaf() might have just removed it.
> +		 * by executing flush_rcu_sheaf() on the cpu we make
> +		 * sure the __kfree_rcu_sheaf() finished its call_rcu()
> +		 */
>  
> -		for_each_online_cpu(cpu) {
> -			sfw = &per_cpu(slub_flush, cpu);
> +		INIT_WORK(&sfw->work, flush_rcu_sheaf);
> +		sfw->s = s;
> +		queue_work_on(cpu, flushwq, &sfw->work);
> +	}
>  
> -			/*
> -			 * we don't check if rcu_free sheaf exists - racing
> -			 * __kfree_rcu_sheaf() might have just removed it.
> -			 * by executing flush_rcu_sheaf() on the cpu we make
> -			 * sure the __kfree_rcu_sheaf() finished its call_rcu()
> -			 */
> +	for_each_online_cpu(cpu) {
> +		sfw = &per_cpu(slub_flush, cpu);
> +		flush_work(&sfw->work);
> +	}
>  
> -			INIT_WORK(&sfw->work, flush_rcu_sheaf);
> -			sfw->s = s;
> -			queue_work_on(cpu, flushwq, &sfw->work);
> -		}
> +	mutex_unlock(&flush_lock);
> +}
>  
> -		for_each_online_cpu(cpu) {
> -			sfw = &per_cpu(slub_flush, cpu);
> -			flush_work(&sfw->work);
> -		}
> +void flush_all_rcu_sheaves(void)
> +{
> +	struct kmem_cache *s;
> +
> +	cpus_read_lock();
> +	mutex_lock(&slab_mutex);
>  
> -		mutex_unlock(&flush_lock);
> +	list_for_each_entry(s, &slab_caches, list) {
> +		if (!s->cpu_sheaves)
> +			continue;
> +		flush_rcu_sheaves_on_cache(s);
>  	}
>  
>  	mutex_unlock(&slab_mutex);
> -- 
> 2.43.0
> 

-- 
Cheers,
Harry / Hyeonggon
Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations
Posted by Harry Yoo 4 months, 3 weeks ago
On Wed, Sep 10, 2025 at 10:01:06AM +0200, Vlastimil Babka wrote:
> Extend the sheaf infrastructure for more efficient kfree_rcu() handling.
> For caches with sheaves, on each cpu maintain a rcu_free sheaf in
> addition to main and spare sheaves.
> 
> kfree_rcu() operations will try to put objects on this sheaf. Once full,
> the sheaf is detached and submitted to call_rcu() with a handler that
> will try to put it in the barn, or flush to slab pages using bulk free,
> when the barn is full. Then a new empty sheaf must be obtained to put
> more objects there.
> 
> It's possible that no free sheaves are available to use for a new
> rcu_free sheaf, and the allocation in kfree_rcu() context can only use
> GFP_NOWAIT and thus may fail. In that case, fall back to the existing
> kfree_rcu() implementation.
> 
> Expected advantages:
> - batching the kfree_rcu() operations, that could eventually replace the
>   existing batching
> - sheaves can be reused for allocations via barn instead of being
>   flushed to slabs, which is more efficient
>   - this includes cases where only some cpus are allowed to process rcu
>     callbacks (Android)
> 
> Possible disadvantage:
> - objects might be waiting for more than their grace period (it is
>   determined by the last object freed into the sheaf), increasing memory
>   usage - but the existing batching does that too.
> 
> Only implement this for CONFIG_KVFREE_RCU_BATCHED as the tiny
> implementation favors smaller memory footprint over performance.
> 
> Also for now skip the usage of rcu sheaf for CONFIG_PREEMPT_RT as the
> contexts where kfree_rcu() is called might not be compatible with taking
> a barn spinlock or a GFP_NOWAIT allocation of a new sheaf taking a
> spinlock - the current kfree_rcu() implementation avoids doing that.
> 
> Teach kvfree_rcu_barrier() to flush all rcu_free sheaves from all caches
> that have them. This is not a cheap operation, but the barrier usage is
> rare - currently kmem_cache_destroy() or on module unload.
> 
> Add CONFIG_SLUB_STATS counters free_rcu_sheaf and free_rcu_sheaf_fail to
> count how many kfree_rcu() used the rcu_free sheaf successfully and how
> many had to fall back to the existing implementation.
> 
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> ---
>  mm/slab.h        |   3 +
>  mm/slab_common.c |  26 ++++++
>  mm/slub.c        | 266 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  3 files changed, 293 insertions(+), 2 deletions(-)
> 
> @@ -3840,6 +3895,80 @@ static void flush_all(struct kmem_cache *s)
>  	cpus_read_unlock();
>  }
>  
> +/* needed for kvfree_rcu_barrier() */
> +void flush_all_rcu_sheaves()
> +{
> +	struct slub_percpu_sheaves *pcs;
> +	struct slub_flush_work *sfw;
> +	struct kmem_cache *s;
> +	bool flushed = false;
> +	unsigned int cpu;
> +
> +	cpus_read_lock();
> +	mutex_lock(&slab_mutex);
> +
> +	list_for_each_entry(s, &slab_caches, list) {
> +		if (!s->cpu_sheaves)
> +			continue;
> +
> +		mutex_lock(&flush_lock);
> +
> +		for_each_online_cpu(cpu) {
> +			sfw = &per_cpu(slub_flush, cpu);
> +			pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
> +
> +			if (!pcs->rcu_free || !pcs->rcu_free->size) {

Is the compiler allowed to compile this to read pcs->rcu_free twice?
Something like:

flush_all_rcu_sheaves()			__kfree_rcu_sheaf()

pcs->rcu_free != NULL
					pcs->rcu_free = NULL
pcs->rcu_free == NULL
/* NULL-pointer-deref */
pcs->rcu_free->size
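
One standard way to make the check robust against such a compiler re-read is to
snapshot the pointer with READ_ONCE() into a local variable; a minimal sketch,
assuming the sheaf type name struct slab_sheaf as elsewhere in the series. The
resolution below ended up simpler: drop the size check and key off the pointer
alone.

	/* inside the for_each_online_cpu() loop of flush_all_rcu_sheaves() */
	struct slab_sheaf *rcu_free = READ_ONCE(pcs->rcu_free);

	if (!rcu_free || !rcu_free->size) {
		sfw->skip = true;
		continue;
	}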

> +				sfw->skip = true;
> +				continue;
> +			}
>
> +			INIT_WORK(&sfw->work, flush_rcu_sheaf);
> +			sfw->skip = false;
> +			sfw->s = s;
> +			queue_work_on(cpu, flushwq, &sfw->work);
> +			flushed = true;
> +		}
> +
> +		for_each_online_cpu(cpu) {
> +			sfw = &per_cpu(slub_flush, cpu);
> +			if (sfw->skip)
> +				continue;
> +			flush_work(&sfw->work);
> +		}
> +
> +		mutex_unlock(&flush_lock);
> +	}
> +
> +	mutex_unlock(&slab_mutex);
> +	cpus_read_unlock();
> +
> +	if (flushed)
> +		rcu_barrier();

I think we need to call rcu_barrier() even if flushed == false?

Maybe a kvfree_rcu()'d object was already waiting for the rcu callback to
be processed before flush_all_rcu_sheaves() is called, and
in flush_all_rcu_sheaves() we skipped all (cache, cpu) pairs,
so flushed == false but the rcu callback isn't processed yet
by the end of the function?

That sounds like a very unlikely to happen in a realistic scenario,
but still possible...

-- 
Cheers,
Harry / Hyeonggon
Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations
Posted by Vlastimil Babka 4 months, 3 weeks ago
On 9/17/25 10:30, Harry Yoo wrote:
> On Wed, Sep 10, 2025 at 10:01:06AM +0200, Vlastimil Babka wrote:
>> +/* needed for kvfree_rcu_barrier() */
>> +void flush_all_rcu_sheaves()
>> +{
>> +	struct slub_percpu_sheaves *pcs;
>> +	struct slub_flush_work *sfw;
>> +	struct kmem_cache *s;
>> +	bool flushed = false;
>> +	unsigned int cpu;
>> +
>> +	cpus_read_lock();
>> +	mutex_lock(&slab_mutex);
>> +
>> +	list_for_each_entry(s, &slab_caches, list) {
>> +		if (!s->cpu_sheaves)
>> +			continue;
>> +
>> +		mutex_lock(&flush_lock);
>> +
>> +		for_each_online_cpu(cpu) {
>> +			sfw = &per_cpu(slub_flush, cpu);
>> +			pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
>> +
>> +			if (!pcs->rcu_free || !pcs->rcu_free->size) {
> 
> Is the compiler allowed to compile this to read pcs->rcu_free twice?
> Something like:
> 
> flush_all_rcu_sheaves()			__kfree_rcu_sheaf()
> 
> pcs->rcu_free != NULL
> 					pcs->rcu_free = NULL
> pcs->rcu_free == NULL
> /* NULL-pointer-deref */
> pcs->rcu_free->size

Good point, I'll remove the size check and simply pcs->rcu_free non-null
means we flush.

>> +				sfw->skip = true;
>> +				continue;
>> +			}
>>
>> +			INIT_WORK(&sfw->work, flush_rcu_sheaf);
>> +			sfw->skip = false;
>> +			sfw->s = s;
>> +			queue_work_on(cpu, flushwq, &sfw->work);
>> +			flushed = true;
>> +		}
>> +
>> +		for_each_online_cpu(cpu) {
>> +			sfw = &per_cpu(slub_flush, cpu);
>> +			if (sfw->skip)
>> +				continue;
>> +			flush_work(&sfw->work);
>> +		}
>> +
>> +		mutex_unlock(&flush_lock);
>> +	}
>> +
>> +	mutex_unlock(&slab_mutex);
>> +	cpus_read_unlock();
>> +
>> +	if (flushed)
>> +		rcu_barrier();
> 
> I think we need to call rcu_barrier() even if flushed == false?
> 
> Maybe a kvfree_rcu()'d object was already waiting for the rcu callback to
> be processed before flush_all_rcu_sheaves() is called, and
> in flush_all_rcu_sheaves() we skipped all (cache, cpu) pairs,
> so flushed == false but the rcu callback isn't processed yet
> by the end of the function?
> 
> That sounds like a very unlikely to happen in a realistic scenario,
> but still possible...

Yes also good point, will flush unconditionally.

Maybe in __kfree_rcu_sheaf() I should also move the call_rcu(...) before
local_unlock(). So we don't end up seeing a NULL pcs->rcu_free in
flush_all_rcu_sheaves() because __kfree_rcu_sheaf() already set it to NULL,
but didn't yet do the call_rcu() as it got preempted after local_unlock().

But then rcu_barrier() itself probably won't mean we make sure such cpus
finished the local_locked section, if we didn't queue work on them. So maybe
we need synchronize_rcu()?
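
A sketch of the reordering suggested above, with the names assumed from the
rest of the series (rcu_free_sheaf() as the call_rcu() handler, a rcu_head
embedded in struct slab_sheaf, and the local_lock embedded in s->cpu_sheaves);
only the ordering of call_rcu() versus local_unlock() is the point here:

	/* __kfree_rcu_sheaf(): the percpu rcu_free sheaf just became full */
	struct slab_sheaf *rcu_sheaf = pcs->rcu_free;

	pcs->rcu_free = NULL;

	/*
	 * Issue call_rcu() before dropping the local_lock, so that once a
	 * flusher observes this locked section as finished, the detached
	 * sheaf is guaranteed to be queued already and thus covered by a
	 * subsequent rcu_barrier().
	 */
	call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf);

	local_unlock(&s->cpu_sheaves->lock);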
Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations
Posted by Paul E. McKenney 4 months, 3 weeks ago
On Wed, Sep 17, 2025 at 11:55:10AM +0200, Vlastimil Babka wrote:
> On 9/17/25 10:30, Harry Yoo wrote:
> > On Wed, Sep 10, 2025 at 10:01:06AM +0200, Vlastimil Babka wrote:
> >> +/* needed for kvfree_rcu_barrier() */
> >> +void flush_all_rcu_sheaves()
> >> +{
> >> +	struct slub_percpu_sheaves *pcs;
> >> +	struct slub_flush_work *sfw;
> >> +	struct kmem_cache *s;
> >> +	bool flushed = false;
> >> +	unsigned int cpu;
> >> +
> >> +	cpus_read_lock();
> >> +	mutex_lock(&slab_mutex);
> >> +
> >> +	list_for_each_entry(s, &slab_caches, list) {
> >> +		if (!s->cpu_sheaves)
> >> +			continue;
> >> +
> >> +		mutex_lock(&flush_lock);
> >> +
> >> +		for_each_online_cpu(cpu) {
> >> +			sfw = &per_cpu(slub_flush, cpu);
> >> +			pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
> >> +
> >> +			if (!pcs->rcu_free || !pcs->rcu_free->size) {
> > 
> > Is the compiler allowed to compile this to read pcs->rcu_free twice?
> > Something like:
> > 
> > flush_all_rcu_sheaves()			__kfree_rcu_sheaf()
> > 
> > pcs->rcu_free != NULL
> > 					pcs->rcu_free = NULL
> > pcs->rcu_free == NULL
> > /* NULL-pointer-deref */
> > pcs->rcu_free->size
> 
> Good point, I'll remove the size check and simply pcs->rcu_free non-null
> means we flush.
> 
> >> +				sfw->skip = true;
> >> +				continue;
> >> +			}
> >>
> >> +			INIT_WORK(&sfw->work, flush_rcu_sheaf);
> >> +			sfw->skip = false;
> >> +			sfw->s = s;
> >> +			queue_work_on(cpu, flushwq, &sfw->work);
> >> +			flushed = true;
> >> +		}
> >> +
> >> +		for_each_online_cpu(cpu) {
> >> +			sfw = &per_cpu(slub_flush, cpu);
> >> +			if (sfw->skip)
> >> +				continue;
> >> +			flush_work(&sfw->work);
> >> +		}
> >> +
> >> +		mutex_unlock(&flush_lock);
> >> +	}
> >> +
> >> +	mutex_unlock(&slab_mutex);
> >> +	cpus_read_unlock();
> >> +
> >> +	if (flushed)
> >> +		rcu_barrier();
> > 
> > I think we need to call rcu_barrier() even if flushed == false?
> > 
> > Maybe a kvfree_rcu()'d object was already waiting for the rcu callback to
> > be processed before flush_all_rcu_sheaves() is called, and
> > in flush_all_rcu_sheaves() we skipped all (cache, cpu) pairs,
> > so flushed == false but the rcu callback isn't processed yet
> > by the end of the function?
> > 
> > That sounds like a very unlikely to happen in a realistic scenario,
> > but still possible...
> 
> Yes also good point, will flush unconditionally.
> 
> Maybe in __kfree_rcu_sheaf() I should also move the call_rcu(...) before
> local_unlock(). So we don't end up seeing a NULL pcs->rcu_free in
> flush_all_rcu_sheaves() because __kfree_rcu_sheaf() already set it to NULL,
> but didn't yet do the call_rcu() as it got preempted after local_unlock().
> 
> But then rcu_barrier() itself probably won't mean we make sure such cpus
> finished the local_locked section, if we didn't queue work on them. So maybe
> we need synchronize_rcu()?

Do you need both rcu_barrier() and synchronize_rcu(), maybe along with
kvfree_rcu_barrier() as well?  It would not be hard to make such a thing,
using workqueues or some such.  Not sure what the API should look like,
especially should people want other RCU flavors to get into the act
as well.

							Thanx, Paul
Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations
Posted by Vlastimil Babka 4 months, 3 weeks ago
On 9/17/25 13:36, Paul E. McKenney wrote:
> On Wed, Sep 17, 2025 at 11:55:10AM +0200, Vlastimil Babka wrote:
>> On 9/17/25 10:30, Harry Yoo wrote:
>> > On Wed, Sep 10, 2025 at 10:01:06AM +0200, Vlastimil Babka wrote:
>> >> +/* needed for kvfree_rcu_barrier() */
>> >> +void flush_all_rcu_sheaves()
>> >> +{
>> >> +	struct slub_percpu_sheaves *pcs;
>> >> +	struct slub_flush_work *sfw;
>> >> +	struct kmem_cache *s;
>> >> +	bool flushed = false;
>> >> +	unsigned int cpu;
>> >> +
>> >> +	cpus_read_lock();
>> >> +	mutex_lock(&slab_mutex);
>> >> +
>> >> +	list_for_each_entry(s, &slab_caches, list) {
>> >> +		if (!s->cpu_sheaves)
>> >> +			continue;
>> >> +
>> >> +		mutex_lock(&flush_lock);
>> >> +
>> >> +		for_each_online_cpu(cpu) {
>> >> +			sfw = &per_cpu(slub_flush, cpu);
>> >> +			pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
>> >> +
>> >> +			if (!pcs->rcu_free || !pcs->rcu_free->size) {
>> > 
>> > Is the compiler allowed to compile this to read pcs->rcu_free twice?
>> > Something like:
>> > 
>> > flush_all_rcu_sheaves()			__kfree_rcu_sheaf()
>> > 
>> > pcs->rcu_free != NULL
>> > 					pcs->rcu_free = NULL
>> > pcs->rcu_free == NULL
>> > /* NULL-pointer-deref */
>> > pcs->rcu_free->size
>> 
>> Good point, I'll remove the size check and simply pcs->rcu_free non-null
>> means we flush.
>> 
>> >> +				sfw->skip = true;
>> >> +				continue;
>> >> +			}
>> >>
>> >> +			INIT_WORK(&sfw->work, flush_rcu_sheaf);
>> >> +			sfw->skip = false;
>> >> +			sfw->s = s;
>> >> +			queue_work_on(cpu, flushwq, &sfw->work);
>> >> +			flushed = true;
>> >> +		}
>> >> +
>> >> +		for_each_online_cpu(cpu) {
>> >> +			sfw = &per_cpu(slub_flush, cpu);
>> >> +			if (sfw->skip)
>> >> +				continue;
>> >> +			flush_work(&sfw->work);
>> >> +		}
>> >> +
>> >> +		mutex_unlock(&flush_lock);
>> >> +	}
>> >> +
>> >> +	mutex_unlock(&slab_mutex);
>> >> +	cpus_read_unlock();
>> >> +
>> >> +	if (flushed)
>> >> +		rcu_barrier();
>> > 
>> > I think we need to call rcu_barrier() even if flushed == false?
>> > 
>> > Maybe a kvfree_rcu()'d object was already waiting for the rcu callback to
>> > be processed before flush_all_rcu_sheaves() is called, and
>> > in flush_all_rcu_sheaves() we skipped all (cache, cpu) pairs,
>> > so flushed == false but the rcu callback isn't processed yet
>> > by the end of the function?
>> > 
>> > That sounds like a very unlikely to happen in a realistic scenario,
>> > but still possible...
>> 
>> Yes also good point, will flush unconditionally.
>> 
>> Maybe in __kfree_rcu_sheaf() I should also move the call_rcu(...) before
>> local_unlock(). So we don't end up seeing a NULL pcs->rcu_free in
>> flush_all_rcu_sheaves() because __kfree_rcu_sheaf() already set it to NULL,
>> but didn't yet do the call_rcu() as it got preempted after local_unlock().
>> 
>> But then rcu_barrier() itself probably won't mean we make sure such cpus
>> finished the local_locked section, if we didn't queue work on them. So maybe
>> we need synchronize_rcu()?
> 
> Do you need both rcu_barrier() and synchronize_rcu(), maybe along with

We need the local_lock protected sections of __kfree_rcu_sheaf() to be
finished (which might be doing call_rcu(rcu_sheaf)), and then the pending
call_rcu(rcu_sheaf) callbacks to be executed.
I think that means both synchronize_rcu() and rcu_barrier(). Possibly a RCU
critical section in __kfree_rcu_sheaf() unless local_lock implies that even
on RT.
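
Expressed as code, the combination being discussed for the tail of
flush_all_rcu_sheaves() would be roughly the following sketch (the thread later
settles on queueing the flush work on every CPU unconditionally, which makes
the synchronize_rcu() unnecessary):

	mutex_unlock(&slab_mutex);
	cpus_read_unlock();

	/*
	 * Wait for CPUs still inside __kfree_rcu_sheaf()'s local_lock
	 * section (which acts like an RCU read-side critical section on
	 * !PREEMPT_RT) to leave it, so their call_rcu() of the detached
	 * sheaf has been issued.
	 */
	synchronize_rcu();

	/* Then wait for all queued rcu_free sheaf callbacks to finish. */
	rcu_barrier();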

> kvfree_rcu_barrier() as well?

This (flush_all_rcu_sheaves()) is for the implementation of
kvfree_rcu_barrier() in a world with rcu_free sheaves. So it's called from
kvfree_rcu_barrier().

> It would not be hard to make such a thing,
> using workqueues or some such.  Not sure what the API should look like,

So there should be no one else calling such an API. There might be new users
of kvfree_rcu_barrier() doing this indirectly in the future.

> especially should people want other RCU flavors to get into the act
> as well.
> 
> 							Thanx, Paul
Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations
Posted by Harry Yoo 4 months, 3 weeks ago
On Wed, Sep 17, 2025 at 11:55:10AM +0200, Vlastimil Babka wrote:
> On 9/17/25 10:30, Harry Yoo wrote:
> > On Wed, Sep 10, 2025 at 10:01:06AM +0200, Vlastimil Babka wrote:
> >> +				sfw->skip = true;
> >> +				continue;
> >> +			}
> >>
> >> +			INIT_WORK(&sfw->work, flush_rcu_sheaf);
> >> +			sfw->skip = false;
> >> +			sfw->s = s;
> >> +			queue_work_on(cpu, flushwq, &sfw->work);
> >> +			flushed = true;
> >> +		}
> >> +
> >> +		for_each_online_cpu(cpu) {
> >> +			sfw = &per_cpu(slub_flush, cpu);
> >> +			if (sfw->skip)
> >> +				continue;
> >> +			flush_work(&sfw->work);
> >> +		}
> >> +
> >> +		mutex_unlock(&flush_lock);
> >> +	}
> >> +
> >> +	mutex_unlock(&slab_mutex);
> >> +	cpus_read_unlock();
> >> +
> >> +	if (flushed)
> >> +		rcu_barrier();
> > 
> > I think we need to call rcu_barrier() even if flushed == false?
> > 
> > Maybe a kvfree_rcu()'d object was already waiting for the rcu callback to
> > be processed before flush_all_rcu_sheaves() is called, and
> > in flush_all_rcu_sheaves() we skipped all (cache, cpu) pairs,
> > so flushed == false but the rcu callback isn't processed yet
> > by the end of the function?
> > 
> > That sounds like a very unlikely to happen in a realistic scenario,
> > but still possible...
> 
> Yes also good point, will flush unconditionally.
> 
> Maybe in __kfree_rcu_sheaf() I should also move the call_rcu(...) before
> local_unlock(). So we don't end up seeing a NULL pcs->rcu_free in
> flush_all_rcu_sheaves() because __kfree_rcu_sheaf() already set it to NULL,
> but didn't yet do the call_rcu() as it got preempted after local_unlock().

Makes sense to me.

> But then rcu_barrier() itself probably won't mean we make sure such cpus
> finished the local_locked section, if we didn't queue work on them. So maybe
> we need synchronize_rcu()?

Ah, it works because preemption disabled section works as a RCU
read-side critical section?

But then are we allowed to release the local_lock to allocate empty
sheaves in __kfree_rcu_sheaf()?

-- 
Cheers,
Harry / Hyeonggon
Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations
Posted by Vlastimil Babka 4 months, 3 weeks ago
On 9/17/25 13:32, Harry Yoo wrote:
> On Wed, Sep 17, 2025 at 11:55:10AM +0200, Vlastimil Babka wrote:
>> On 9/17/25 10:30, Harry Yoo wrote:
>> > On Wed, Sep 10, 2025 at 10:01:06AM +0200, Vlastimil Babka wrote:
>> >> +				sfw->skip = true;
>> >> +				continue;
>> >> +			}
>> >>
>> >> +			INIT_WORK(&sfw->work, flush_rcu_sheaf);
>> >> +			sfw->skip = false;
>> >> +			sfw->s = s;
>> >> +			queue_work_on(cpu, flushwq, &sfw->work);
>> >> +			flushed = true;
>> >> +		}
>> >> +
>> >> +		for_each_online_cpu(cpu) {
>> >> +			sfw = &per_cpu(slub_flush, cpu);
>> >> +			if (sfw->skip)
>> >> +				continue;
>> >> +			flush_work(&sfw->work);
>> >> +		}
>> >> +
>> >> +		mutex_unlock(&flush_lock);
>> >> +	}
>> >> +
>> >> +	mutex_unlock(&slab_mutex);
>> >> +	cpus_read_unlock();
>> >> +
>> >> +	if (flushed)
>> >> +		rcu_barrier();
>> > 
>> > I think we need to call rcu_barrier() even if flushed == false?
>> > 
>> > Maybe a kvfree_rcu()'d object was already waiting for the rcu callback to
>> > be processed before flush_all_rcu_sheaves() is called, and
>> > in flush_all_rcu_sheaves() we skipped all (cache, cpu) pairs,
>> > so flushed == false but the rcu callback isn't processed yet
>> > by the end of the function?
>> > 
>> > That sounds like a very unlikely to happen in a realistic scenario,
>> > but still possible...
>> 
>> Yes also good point, will flush unconditionally.
>> 
>> Maybe in __kfree_rcu_sheaf() I should also move the call_rcu(...) before
>> local_unlock(). So we don't end up seeing a NULL pcs->rcu_free in
>> flush_all_rcu_sheaves() because __kfree_rcu_sheaf() already set it to NULL,
>> but didn't yet do the call_rcu() as it got preempted after local_unlock().
> 
> Makes sense to me.
> 
>> But then rcu_barrier() itself probably won't mean we make sure such cpus
>> finished the local_locked section, if we didn't queue work on them. So maybe
>> we need synchronize_rcu()?
> 
> Ah, it works because preemption disabled section works as a RCU
> read-side critical section?

AFAIK yes? Or maybe not on RT where local_lock is taking a mutex? So we
should denote the RCU critical section explicitly too?

> But then are we allowed to release the local_lock to allocate empty
> sheaves in __kfree_rcu_sheaf()?

I think so, as we do that when we found no rcu_sheaf in the first place.
Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations
Posted by Harry Yoo 4 months, 3 weeks ago
On Wed, Sep 17, 2025 at 02:05:49PM +0200, Vlastimil Babka wrote:
> On 9/17/25 13:32, Harry Yoo wrote:
> > On Wed, Sep 17, 2025 at 11:55:10AM +0200, Vlastimil Babka wrote:
> >> On 9/17/25 10:30, Harry Yoo wrote:
> >> > On Wed, Sep 10, 2025 at 10:01:06AM +0200, Vlastimil Babka wrote:
> >> >> +				sfw->skip = true;
> >> >> +				continue;
> >> >> +			}
> >> >>
> >> >> +			INIT_WORK(&sfw->work, flush_rcu_sheaf);
> >> >> +			sfw->skip = false;
> >> >> +			sfw->s = s;
> >> >> +			queue_work_on(cpu, flushwq, &sfw->work);
> >> >> +			flushed = true;
> >> >> +		}
> >> >> +
> >> >> +		for_each_online_cpu(cpu) {
> >> >> +			sfw = &per_cpu(slub_flush, cpu);
> >> >> +			if (sfw->skip)
> >> >> +				continue;
> >> >> +			flush_work(&sfw->work);
> >> >> +		}
> >> >> +
> >> >> +		mutex_unlock(&flush_lock);
> >> >> +	}
> >> >> +
> >> >> +	mutex_unlock(&slab_mutex);
> >> >> +	cpus_read_unlock();
> >> >> +
> >> >> +	if (flushed)
> >> >> +		rcu_barrier();
> >> > 
> >> > I think we need to call rcu_barrier() even if flushed == false?
> >> > 
> >> > Maybe a kvfree_rcu()'d object was already waiting for the rcu callback to
> >> > be processed before flush_all_rcu_sheaves() is called, and
> >> > in flush_all_rcu_sheaves() we skipped all (cache, cpu) pairs,
> >> > so flushed == false but the rcu callback isn't processed yet
> >> > by the end of the function?
> >> > 
> >> > That sounds like a very unlikely to happen in a realistic scenario,
> >> > but still possible...
> >> 
> >> Yes also good point, will flush unconditionally.
> >> 
> >> Maybe in __kfree_rcu_sheaf() I should also move the call_rcu(...) before
> >> local_unlock().
> >>
> >> So we don't end up seeing a NULL pcs->rcu_free in
> >> flush_all_rcu_sheaves() because __kfree_rcu_sheaf() already set it to NULL,
> >> but didn't yet do the call_rcu() as it got preempted after local_unlock().
> > 
> > Makes sense to me.

Wait, I'm confused.

I think the caller of kvfree_rcu_barrier() should make sure that it's invoked
only after a kvfree_rcu(X, rhs) call has returned, if the caller expects
the object X to be freed before kvfree_rcu_barrier() returns?

IOW if flush_all_rcu_sheaves() is called while __kfree_rcu_sheaf(s, X) was
running on another CPU, we don't have to guarantee that
flush_all_rcu_sheaves() returns after the object X is freed?

> >> But then rcu_barrier() itself probably won't mean we make sure such cpus
> >> finished the local_locked section, if we didn't queue work on them. So maybe
> >> we need synchronize_rcu()?

So... we don't need a synchronize_rcu() then?

Or my brain started malfunctioning again :D

-- 
Cheers,
Harry / Hyeonggon
Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations
Posted by Vlastimil Babka 4 months, 3 weeks ago
On 9/17/25 15:07, Harry Yoo wrote:
> On Wed, Sep 17, 2025 at 02:05:49PM +0200, Vlastimil Babka wrote:
>> On 9/17/25 13:32, Harry Yoo wrote:
>> > On Wed, Sep 17, 2025 at 11:55:10AM +0200, Vlastimil Babka wrote:
>> >> On 9/17/25 10:30, Harry Yoo wrote:
>> >> > On Wed, Sep 10, 2025 at 10:01:06AM +0200, Vlastimil Babka wrote:
>> >> >> +				sfw->skip = true;
>> >> >> +				continue;
>> >> >> +			}
>> >> >>
>> >> >> +			INIT_WORK(&sfw->work, flush_rcu_sheaf);
>> >> >> +			sfw->skip = false;
>> >> >> +			sfw->s = s;
>> >> >> +			queue_work_on(cpu, flushwq, &sfw->work);
>> >> >> +			flushed = true;
>> >> >> +		}
>> >> >> +
>> >> >> +		for_each_online_cpu(cpu) {
>> >> >> +			sfw = &per_cpu(slub_flush, cpu);
>> >> >> +			if (sfw->skip)
>> >> >> +				continue;
>> >> >> +			flush_work(&sfw->work);
>> >> >> +		}
>> >> >> +
>> >> >> +		mutex_unlock(&flush_lock);
>> >> >> +	}
>> >> >> +
>> >> >> +	mutex_unlock(&slab_mutex);
>> >> >> +	cpus_read_unlock();
>> >> >> +
>> >> >> +	if (flushed)
>> >> >> +		rcu_barrier();
>> >> > 
>> >> > I think we need to call rcu_barrier() even if flushed == false?
>> >> > 
>> >> > Maybe a kvfree_rcu()'d object was already waiting for the rcu callback to
>> >> > be processed before flush_all_rcu_sheaves() is called, and
>> >> > in flush_all_rcu_sheaves() we skipped all (cache, cpu) pairs,
>> >> > so flushed == false but the rcu callback isn't processed yet
>> >> > by the end of the function?
>> >> > 
>> >> > That sounds like a very unlikely to happen in a realistic scenario,
>> >> > but still possible...
>> >> 
>> >> Yes also good point, will flush unconditionally.
>> >> 
>> >> Maybe in __kfree_rcu_sheaf() I should also move the call_rcu(...) before
>> >> local_unlock().
>> >>
>> >> So we don't end up seeing a NULL pcs->rcu_free in
>> >> flush_all_rcu_sheaves() because __kfree_rcu_sheaf() already set it to NULL,
>> >> but didn't yet do the call_rcu() as it got preempted after local_unlock().
>> > 
>> > Makes sense to me.
> 
> Wait, I'm confused.
> 
> I think the caller of kvfree_rcu_barrier() should make sure that it's invoked
> only after a kvfree_rcu(X, rhs) call has returned, if the caller expects
> the object X to be freed before kvfree_rcu_barrier() returns?

Hmm, the caller of kvfree_rcu(X, rhs) might have returned without filling up
the rcu_sheaf fully and thus without submitting it to call_rcu(), then
migrated to another cpu. Then it calls kvfree_rcu_barrier() while another
unrelated kvfree_rcu(X, rhs) call on the previous cpu is for the same
kmem_cache (kvfree_rcu_barrier() is not only for cache destruction), fills
up the rcu_sheaf fully and is about to call_rcu() on it. And since that
sheaf also contains the object X, we should make sure that is flushed.

> IOW if flush_all_rcu_sheaves() is called while __kfree_rcu_sheaf(s, X) was
> running on another CPU, we don't have to guarantee that
> flush_all_rcu_sheaves() returns after the object X is freed?
> 
>> >> But then rcu_barrier() itself probably won't mean we make sure such cpus
>> >> finished the local_locked section, if we didn't queue work on them. So maybe
>> >> we need synchronize_rcu()?
> 
> So... we don't need a synchronize_rcu() then?
> 
> Or my brain started malfunctioning again :D
>
Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations
Posted by Harry Yoo 4 months, 3 weeks ago
On Wed, Sep 17, 2025 at 03:21:31PM +0200, Vlastimil Babka wrote:
> On 9/17/25 15:07, Harry Yoo wrote:
> > On Wed, Sep 17, 2025 at 02:05:49PM +0200, Vlastimil Babka wrote:
> >> On 9/17/25 13:32, Harry Yoo wrote:
> >> > On Wed, Sep 17, 2025 at 11:55:10AM +0200, Vlastimil Babka wrote:
> >> >> On 9/17/25 10:30, Harry Yoo wrote:
> >> >> > On Wed, Sep 10, 2025 at 10:01:06AM +0200, Vlastimil Babka wrote:
> >> >> >> +				sfw->skip = true;
> >> >> >> +				continue;
> >> >> >> +			}
> >> >> >>
> >> >> >> +			INIT_WORK(&sfw->work, flush_rcu_sheaf);
> >> >> >> +			sfw->skip = false;
> >> >> >> +			sfw->s = s;
> >> >> >> +			queue_work_on(cpu, flushwq, &sfw->work);
> >> >> >> +			flushed = true;
> >> >> >> +		}
> >> >> >> +
> >> >> >> +		for_each_online_cpu(cpu) {
> >> >> >> +			sfw = &per_cpu(slub_flush, cpu);
> >> >> >> +			if (sfw->skip)
> >> >> >> +				continue;
> >> >> >> +			flush_work(&sfw->work);
> >> >> >> +		}
> >> >> >> +
> >> >> >> +		mutex_unlock(&flush_lock);
> >> >> >> +	}
> >> >> >> +
> >> >> >> +	mutex_unlock(&slab_mutex);
> >> >> >> +	cpus_read_unlock();
> >> >> >> +
> >> >> >> +	if (flushed)
> >> >> >> +		rcu_barrier();
> >> >> > 
> >> >> > I think we need to call rcu_barrier() even if flushed == false?
> >> >> > 
> >> >> > Maybe a kvfree_rcu()'d object was already waiting for the rcu callback to
> >> >> > be processed before flush_all_rcu_sheaves() is called, and
> >> >> > in flush_all_rcu_sheaves() we skipped all (cache, cpu) pairs,
> >> >> > so flushed == false but the rcu callback isn't processed yet
> >> >> > by the end of the function?
> >> >> > 
> >> >> > That sounds like a very unlikely to happen in a realistic scenario,
> >> >> > but still possible...
> >> >> 
> >> >> Yes also good point, will flush unconditionally.
> >> >> 
> >> >> Maybe in __kfree_rcu_sheaf() I should also move the call_rcu(...) before
> >> >> local_unlock().
> >> >>
> >> >> So we don't end up seeing a NULL pcs->rcu_free in
> >> >> flush_all_rcu_sheaves() because __kfree_rcu_sheaf() already set it to NULL,
> >> >> but didn't yet do the call_rcu() as it got preempted after local_unlock().
> >> > 
> >> > Makes sense to me.
> > 
> > Wait, I'm confused.
> > 
> > I think the caller of kvfree_rcu_barrier() should make sure that it's invoked
> > only after a kvfree_rcu(X, rhs) call has returned, if the caller expects
> > the object X to be freed before kvfree_rcu_barrier() returns?
> 
> Hmm, the caller of kvfree_rcu(X, rhs) might have returned without filling up
> the rcu_sheaf fully and thus without submitting it to call_rcu(), then
> migrated to another cpu. Then it calls kvfree_rcu_barrier() while another
> unrelated kvfree_rcu(X, rhs) call on the previous cpu is for the same
> kmem_cache (kvfree_rcu_barrier() is not only for cache destruction), fills
> up the rcu_sheaf fully and is about to call_rcu() on it. And since that
> sheaf also contains the object X, we should make sure that is flushed.

I was going to say "but we queue and wait for the flushing work to
complete, so the sheaf containing object X should be flushed?"

But nah, that's true only if we see pcs->rcu_free != NULL in
flush_all_rcu_sheaves().

You are right...

Hmm, maybe it's simpler to fix this by never skipping queueing the work
even when pcs->rcu_free == NULL?

> > IOW if flush_all_rcu_sheaves() is called while __kfree_rcu_sheaf(s, X) was
> > running on another CPU, we don't have to guarantee that
> > flush_all_rcu_sheaves() returns after the object X is freed?
> > 
> >> >> But then rcu_barrier() itself probably won't mean we make sure such cpus
> >> >> finished the local_locked section, if we didn't queue work on them. So maybe
> >> >> we need synchronize_rcu()?
> > 
> > So... we don't need a synchronize_rcu() then?
> > 
> > Or my brain started malfunctioning again :D

-- 
Cheers,
Harry / Hyeonggon
Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations
Posted by Vlastimil Babka 4 months, 3 weeks ago
On 9/17/25 15:34, Harry Yoo wrote:
> On Wed, Sep 17, 2025 at 03:21:31PM +0200, Vlastimil Babka wrote:
>> On 9/17/25 15:07, Harry Yoo wrote:
>> > On Wed, Sep 17, 2025 at 02:05:49PM +0200, Vlastimil Babka wrote:
>> >> On 9/17/25 13:32, Harry Yoo wrote:
>> >> > On Wed, Sep 17, 2025 at 11:55:10AM +0200, Vlastimil Babka wrote:
>> >> >> On 9/17/25 10:30, Harry Yoo wrote:
>> >> >> > On Wed, Sep 10, 2025 at 10:01:06AM +0200, Vlastimil Babka wrote:
>> >> >> >> +				sfw->skip = true;
>> >> >> >> +				continue;
>> >> >> >> +			}
>> >> >> >>
>> >> >> >> +			INIT_WORK(&sfw->work, flush_rcu_sheaf);
>> >> >> >> +			sfw->skip = false;
>> >> >> >> +			sfw->s = s;
>> >> >> >> +			queue_work_on(cpu, flushwq, &sfw->work);
>> >> >> >> +			flushed = true;
>> >> >> >> +		}
>> >> >> >> +
>> >> >> >> +		for_each_online_cpu(cpu) {
>> >> >> >> +			sfw = &per_cpu(slub_flush, cpu);
>> >> >> >> +			if (sfw->skip)
>> >> >> >> +				continue;
>> >> >> >> +			flush_work(&sfw->work);
>> >> >> >> +		}
>> >> >> >> +
>> >> >> >> +		mutex_unlock(&flush_lock);
>> >> >> >> +	}
>> >> >> >> +
>> >> >> >> +	mutex_unlock(&slab_mutex);
>> >> >> >> +	cpus_read_unlock();
>> >> >> >> +
>> >> >> >> +	if (flushed)
>> >> >> >> +		rcu_barrier();
>> >> >> > 
>> >> >> > I think we need to call rcu_barrier() even if flushed == false?
>> >> >> > 
>> >> >> > Maybe a kvfree_rcu()'d object was already waiting for the rcu callback to
>> >> >> > be processed before flush_all_rcu_sheaves() is called, and
>> >> >> > in flush_all_rcu_sheaves() we skipped all (cache, cpu) pairs,
>> >> >> > so flushed == false but the rcu callback isn't processed yet
>> >> >> > by the end of the function?
>> >> >> > 
>> >> >> > That sounds like a very unlikely to happen in a realistic scenario,
>> >> >> > but still possible...
>> >> >> 
>> >> >> Yes also good point, will flush unconditionally.
>> >> >> 
>> >> >> Maybe in __kfree_rcu_sheaf() I should also move the call_rcu(...) before
>> >> >> local_unlock().
>> >> >>
>> >> >> So we don't end up seeing a NULL pcs->rcu_free in
>> >> >> flush_all_rcu_sheaves() because __kfree_rcu_sheaf() already set it to NULL,
>> >> >> but didn't yet do the call_rcu() as it got preempted after local_unlock().
>> >> > 
>> >> > Makes sense to me.
>> > 
>> > Wait, I'm confused.
>> > 
>> > I think the caller of kvfree_rcu_barrier() should make sure that it's invoked
>> > only after a kvfree_rcu(X, rhs) call has returned, if the caller expects
>> > the object X to be freed before kvfree_rcu_barrier() returns?
>> 
>> Hmm, the caller of kvfree_rcu(X, rhs) might have returned without filling up
>> the rcu_sheaf fully and thus without submitting it to call_rcu(), then
>> migrated to another cpu. Then it calls kvfree_rcu_barrier() while another
>> unrelated kvfree_rcu(X, rhs) call on the previous cpu is for the same
>> kmem_cache (kvfree_rcu_barrier() is not only for cache destruction), fills
>> up the rcu_sheaf fully and is about to call_rcu() on it. And since that
>> sheaf also contains the object X, we should make sure that is flushed.
> 
> I was going to say "but we queue and wait for the flushing work to
> complete, so the sheaf containing object X should be flushed?"
> 
> But nah, that's true only if we see pcs->rcu_free != NULL in
> flush_all_rcu_sheaves().
> 
> You are right...
> 
> Hmm, maybe it's simpler to fix this by never skipping queueing the work
> even when pcs->rcu_free == NULL?

I guess it's simpler, yeah.
We might have to think of something better once all caches have sheaves;
queueing and waiting for work to finish on each cpu, repeated for each
kmem_cache, might just be too much?

>> > IOW if flush_all_rcu_sheaves() is called while __kfree_rcu_sheaf(s, X) was
>> > running on another CPU, we don't have to guarantee that
>> > flush_all_rcu_sheaves() returns after the object X is freed?
>> > 
>> >> >> But then rcu_barrier() itself probably won't mean we make sure such cpus
>> >> >> finished the local_locked section, if we didn't queue work on them. So maybe
>> >> >> we need synchronize_rcu()?
>> > 
>> > So... we don't need a synchronize_rcu() then?
>> > 
>> > Or my brain started malfunctioning again :D
>
Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations
Posted by Vlastimil Babka 4 months, 3 weeks ago
On 9/17/25 16:14, Vlastimil Babka wrote:
> On 9/17/25 15:34, Harry Yoo wrote:
>> On Wed, Sep 17, 2025 at 03:21:31PM +0200, Vlastimil Babka wrote:
>>> On 9/17/25 15:07, Harry Yoo wrote:
>>> > On Wed, Sep 17, 2025 at 02:05:49PM +0200, Vlastimil Babka wrote:
>>> >> On 9/17/25 13:32, Harry Yoo wrote:
>>> >> > On Wed, Sep 17, 2025 at 11:55:10AM +0200, Vlastimil Babka wrote:
>>> >> >> On 9/17/25 10:30, Harry Yoo wrote:
>>> >> >> > On Wed, Sep 10, 2025 at 10:01:06AM +0200, Vlastimil Babka wrote:
>>> >> >> >> +				sfw->skip = true;
>>> >> >> >> +				continue;
>>> >> >> >> +			}
>>> >> >> >>
>>> >> >> >> +			INIT_WORK(&sfw->work, flush_rcu_sheaf);
>>> >> >> >> +			sfw->skip = false;
>>> >> >> >> +			sfw->s = s;
>>> >> >> >> +			queue_work_on(cpu, flushwq, &sfw->work);
>>> >> >> >> +			flushed = true;
>>> >> >> >> +		}
>>> >> >> >> +
>>> >> >> >> +		for_each_online_cpu(cpu) {
>>> >> >> >> +			sfw = &per_cpu(slub_flush, cpu);
>>> >> >> >> +			if (sfw->skip)
>>> >> >> >> +				continue;
>>> >> >> >> +			flush_work(&sfw->work);
>>> >> >> >> +		}
>>> >> >> >> +
>>> >> >> >> +		mutex_unlock(&flush_lock);
>>> >> >> >> +	}
>>> >> >> >> +
>>> >> >> >> +	mutex_unlock(&slab_mutex);
>>> >> >> >> +	cpus_read_unlock();
>>> >> >> >> +
>>> >> >> >> +	if (flushed)
>>> >> >> >> +		rcu_barrier();
>>> >> >> > 
>>> >> >> > I think we need to call rcu_barrier() even if flushed == false?
>>> >> >> > 
>>> >> >> > Maybe a kvfree_rcu()'d object was already waiting for the rcu callback to
>>> >> >> > be processed before flush_all_rcu_sheaves() is called, and
>>> >> >> > in flush_all_rcu_sheaves() we skipped all (cache, cpu) pairs,
>>> >> >> > so flushed == false but the rcu callback isn't processed yet
>>> >> >> > by the end of the function?
>>> >> >> > 
>>> >> >> > That sounds like a very unlikely to happen in a realistic scenario,
>>> >> >> > but still possible...
>>> >> >> 
>>> >> >> Yes also good point, will flush unconditionally.
>>> >> >> 
>>> >> >> Maybe in __kfree_rcu_sheaf() I should also move the call_rcu(...) before
>>> >> >> local_unlock().
>>> >> >>
>>> >> >> So we don't end up seeing a NULL pcs->rcu_free in
>>> >> >> flush_all_rcu_sheaves() because __kfree_rcu_sheaf() already set it to NULL,
>>> >> >> but didn't yet do the call_rcu() as it got preempted after local_unlock().
>>> >> > 
>>> >> > Makes sense to me.
>>> > 
>>> > Wait, I'm confused.
>>> > 
>>> > I think the caller of kvfree_rcu_barrier() should make sure that it's invoked
>>> > only after a kvfree_rcu(X, rhs) call has returned, if the caller expects
>>> > the object X to be freed before kvfree_rcu_barrier() returns?
>>> 
>>> Hmm, the caller of kvfree_rcu(X, rhs) might have returned without filling up
>>> the rcu_sheaf fully and thus without submitting it to call_rcu(), then
>>> migrated to another cpu. Then it calls kvfree_rcu_barrier() while another
>>> unrelated kvfree_rcu(X, rhs) call on the previous cpu is for the same
>>> kmem_cache (kvfree_rcu_barrier() is not only for cache destruction), fills
>>> up the rcu_sheaf fully and is about to call_rcu() on it. And since that
>>> sheaf also contains the object X, we should make sure that is flushed.
>> 
>> I was going to say "but we queue and wait for the flushing work to
>> complete, so the sheaf containing object X should be flushed?"
>> 
>> But nah, that's true only if we see pcs->rcu_free != NULL in
>> flush_all_rcu_sheaves().
>> 
>> You are right...
>> 
>> Hmm, maybe it's simpler to fix this by never skipping queueing the work
>> even when pcs->rcu_free == NULL?
> 
> I guess it's simpler, yeah.

So what about this? The unconditional queueing should cover all races with
__kfree_rcu_sheaf() so there's just unconditional rcu_barrier() in the end.

From 0722b29fa1625b31c05d659d1d988ec882247b38 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Wed, 3 Sep 2025 14:59:46 +0200
Subject: [PATCH] slab: add sheaf support for batching kfree_rcu() operations

Extend the sheaf infrastructure for more efficient kfree_rcu() handling.
For caches with sheaves, on each cpu maintain a rcu_free sheaf in
addition to main and spare sheaves.

kfree_rcu() operations will try to put objects on this sheaf. Once full,
the sheaf is detached and submitted to call_rcu() with a handler that
will try to put it in the barn, or flush to slab pages using bulk free,
when the barn is full. Then a new empty sheaf must be obtained to put
more objects there.

It's possible that no free sheaves are available to use for a new
rcu_free sheaf, and the allocation in kfree_rcu() context can only use
GFP_NOWAIT and thus may fail. In that case, fall back to the existing
kfree_rcu() implementation.

Expected advantages:
- batching the kfree_rcu() operations, that could eventually replace the
  existing batching
- sheaves can be reused for allocations via barn instead of being
  flushed to slabs, which is more efficient
  - this includes cases where only some cpus are allowed to process rcu
    callbacks (Android)

Possible disadvantage:
- objects might be waiting for more than their grace period (it is
  determined by the last object freed into the sheaf), increasing memory
  usage - but the existing batching does that too.

Only implement this for CONFIG_KVFREE_RCU_BATCHED as the tiny
implementation favors smaller memory footprint over performance.

Also for now skip the usage of rcu sheaf for CONFIG_PREEMPT_RT as the
contexts where kfree_rcu() is called might not be compatible with taking
a barn spinlock or a GFP_NOWAIT allocation of a new sheaf taking a
spinlock - the current kfree_rcu() implementation avoids doing that.

Teach kvfree_rcu_barrier() to flush all rcu_free sheaves from all caches
that have them. This is not a cheap operation, but the barrier usage is
rare - currently kmem_cache_destroy() or on module unload.

Add CONFIG_SLUB_STATS counters free_rcu_sheaf and free_rcu_sheaf_fail to
count how many kfree_rcu() used the rcu_free sheaf successfully and how
many had to fall back to the existing implementation.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 mm/slab.h        |   3 +
 mm/slab_common.c |  26 +++++
 mm/slub.c        | 267 ++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 294 insertions(+), 2 deletions(-)

diff --git a/mm/slab.h b/mm/slab.h
index 206987ce44a4..e82e51c44bd0 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -435,6 +435,9 @@ static inline bool is_kmalloc_normal(struct kmem_cache *s)
 	return !(s->flags & (SLAB_CACHE_DMA|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT));
 }
 
+bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj);
+void flush_all_rcu_sheaves(void);
+
 #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
 			 SLAB_CACHE_DMA32 | SLAB_PANIC | \
 			 SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS | \
diff --git a/mm/slab_common.c b/mm/slab_common.c
index e2b197e47866..005a4319c06a 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1608,6 +1608,27 @@ static void kfree_rcu_work(struct work_struct *work)
 		kvfree_rcu_list(head);
 }
 
+static bool kfree_rcu_sheaf(void *obj)
+{
+	struct kmem_cache *s;
+	struct folio *folio;
+	struct slab *slab;
+
+	if (is_vmalloc_addr(obj))
+		return false;
+
+	folio = virt_to_folio(obj);
+	if (unlikely(!folio_test_slab(folio)))
+		return false;
+
+	slab = folio_slab(folio);
+	s = slab->slab_cache;
+	if (s->cpu_sheaves)
+		return __kfree_rcu_sheaf(s, obj);
+
+	return false;
+}
+
 static bool
 need_offload_krc(struct kfree_rcu_cpu *krcp)
 {
@@ -1952,6 +1973,9 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr)
 	if (!head)
 		might_sleep();
 
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT) && kfree_rcu_sheaf(ptr))
+		return;
+
 	// Queue the object but don't yet schedule the batch.
 	if (debug_rcu_head_queue(ptr)) {
 		// Probable double kfree_rcu(), just leak.
@@ -2026,6 +2050,8 @@ void kvfree_rcu_barrier(void)
 	bool queued;
 	int i, cpu;
 
+	flush_all_rcu_sheaves();
+
 	/*
 	 * Firstly we detach objects and queue them over an RCU-batch
 	 * for all CPUs. Finally queued works are flushed for each CPU.
diff --git a/mm/slub.c b/mm/slub.c
index cba188b7e04d..171273f90efd 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -367,6 +367,8 @@ enum stat_item {
 	ALLOC_FASTPATH,		/* Allocation from cpu slab */
 	ALLOC_SLOWPATH,		/* Allocation by getting a new cpu slab */
 	FREE_PCS,		/* Free to percpu sheaf */
+	FREE_RCU_SHEAF,		/* Free to rcu_free sheaf */
+	FREE_RCU_SHEAF_FAIL,	/* Failed to free to a rcu_free sheaf */
 	FREE_FASTPATH,		/* Free to cpu slab */
 	FREE_SLOWPATH,		/* Freeing not to cpu slab */
 	FREE_FROZEN,		/* Freeing to frozen slab */
@@ -461,6 +463,7 @@ struct slab_sheaf {
 		struct rcu_head rcu_head;
 		struct list_head barn_list;
 	};
+	struct kmem_cache *cache;
 	unsigned int size;
 	void *objects[];
 };
@@ -469,6 +472,7 @@ struct slub_percpu_sheaves {
 	local_trylock_t lock;
 	struct slab_sheaf *main; /* never NULL when unlocked */
 	struct slab_sheaf *spare; /* empty or full, may be NULL */
+	struct slab_sheaf *rcu_free; /* for batching kfree_rcu() */
 };
 
 /*
@@ -2531,6 +2535,8 @@ static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp)
 	if (unlikely(!sheaf))
 		return NULL;
 
+	sheaf->cache = s;
+
 	stat(s, SHEAF_ALLOC);
 
 	return sheaf;
@@ -2655,6 +2661,43 @@ static void sheaf_flush_unused(struct kmem_cache *s, struct slab_sheaf *sheaf)
 	sheaf->size = 0;
 }
 
+static void __rcu_free_sheaf_prepare(struct kmem_cache *s,
+				     struct slab_sheaf *sheaf)
+{
+	bool init = slab_want_init_on_free(s);
+	void **p = &sheaf->objects[0];
+	unsigned int i = 0;
+
+	while (i < sheaf->size) {
+		struct slab *slab = virt_to_slab(p[i]);
+
+		memcg_slab_free_hook(s, slab, p + i, 1);
+		alloc_tagging_slab_free_hook(s, slab, p + i, 1);
+
+		if (unlikely(!slab_free_hook(s, p[i], init, true))) {
+			p[i] = p[--sheaf->size];
+			continue;
+		}
+
+		i++;
+	}
+}
+
+static void rcu_free_sheaf_nobarn(struct rcu_head *head)
+{
+	struct slab_sheaf *sheaf;
+	struct kmem_cache *s;
+
+	sheaf = container_of(head, struct slab_sheaf, rcu_head);
+	s = sheaf->cache;
+
+	__rcu_free_sheaf_prepare(s, sheaf);
+
+	sheaf_flush_unused(s, sheaf);
+
+	free_empty_sheaf(s, sheaf);
+}
+
 /*
  * Caller needs to make sure migration is disabled in order to fully flush
  * single cpu's sheaves
@@ -2667,7 +2710,7 @@ static void sheaf_flush_unused(struct kmem_cache *s, struct slab_sheaf *sheaf)
 static void pcs_flush_all(struct kmem_cache *s)
 {
 	struct slub_percpu_sheaves *pcs;
-	struct slab_sheaf *spare;
+	struct slab_sheaf *spare, *rcu_free;
 
 	local_lock(&s->cpu_sheaves->lock);
 	pcs = this_cpu_ptr(s->cpu_sheaves);
@@ -2675,6 +2718,9 @@ static void pcs_flush_all(struct kmem_cache *s)
 	spare = pcs->spare;
 	pcs->spare = NULL;
 
+	rcu_free = pcs->rcu_free;
+	pcs->rcu_free = NULL;
+
 	local_unlock(&s->cpu_sheaves->lock);
 
 	if (spare) {
@@ -2682,6 +2728,9 @@ static void pcs_flush_all(struct kmem_cache *s)
 		free_empty_sheaf(s, spare);
 	}
 
+	if (rcu_free)
+		call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn);
+
 	sheaf_flush_main(s);
 }
 
@@ -2698,6 +2747,11 @@ static void __pcs_flush_all_cpu(struct kmem_cache *s, unsigned int cpu)
 		free_empty_sheaf(s, pcs->spare);
 		pcs->spare = NULL;
 	}
+
+	if (pcs->rcu_free) {
+		call_rcu(&pcs->rcu_free->rcu_head, rcu_free_sheaf_nobarn);
+		pcs->rcu_free = NULL;
+	}
 }
 
 static void pcs_destroy(struct kmem_cache *s)
@@ -2723,6 +2777,7 @@ static void pcs_destroy(struct kmem_cache *s)
 		 */
 
 		WARN_ON(pcs->spare);
+		WARN_ON(pcs->rcu_free);
 
 		if (!WARN_ON(pcs->main->size)) {
 			free_empty_sheaf(s, pcs->main);
@@ -3780,7 +3835,7 @@ static bool has_pcs_used(int cpu, struct kmem_cache *s)
 
 	pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
 
-	return (pcs->spare || pcs->main->size);
+	return (pcs->spare || pcs->rcu_free || pcs->main->size);
 }
 
 /*
@@ -3840,6 +3895,77 @@ static void flush_all(struct kmem_cache *s)
 	cpus_read_unlock();
 }
 
+static void flush_rcu_sheaf(struct work_struct *w)
+{
+	struct slub_percpu_sheaves *pcs;
+	struct slab_sheaf *rcu_free;
+	struct slub_flush_work *sfw;
+	struct kmem_cache *s;
+
+	sfw = container_of(w, struct slub_flush_work, work);
+	s = sfw->s;
+
+	local_lock(&s->cpu_sheaves->lock);
+	pcs = this_cpu_ptr(s->cpu_sheaves);
+
+	rcu_free = pcs->rcu_free;
+	pcs->rcu_free = NULL;
+
+	local_unlock(&s->cpu_sheaves->lock);
+
+	if (rcu_free)
+		call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn);
+}
+
+
+/* needed for kvfree_rcu_barrier() */
+void flush_all_rcu_sheaves(void)
+{
+	struct slub_flush_work *sfw;
+	struct kmem_cache *s;
+	unsigned int cpu;
+
+	cpus_read_lock();
+	mutex_lock(&slab_mutex);
+
+	list_for_each_entry(s, &slab_caches, list) {
+		if (!s->cpu_sheaves)
+			continue;
+
+		mutex_lock(&flush_lock);
+
+		for_each_online_cpu(cpu) {
+			sfw = &per_cpu(slub_flush, cpu);
+
+			/*
+			 * We don't check whether the rcu_free sheaf exists -
+			 * a racing __kfree_rcu_sheaf() might have just removed
+			 * it. By executing flush_rcu_sheaf() on the cpu we make
+			 * sure such a __kfree_rcu_sheaf() finished its call_rcu()
+			 */
+
+			INIT_WORK(&sfw->work, flush_rcu_sheaf);
+			sfw->skip = false;
+			sfw->s = s;
+			queue_work_on(cpu, flushwq, &sfw->work);
+		}
+
+		for_each_online_cpu(cpu) {
+			sfw = &per_cpu(slub_flush, cpu);
+			if (sfw->skip)
+				continue;
+			flush_work(&sfw->work);
+		}
+
+		mutex_unlock(&flush_lock);
+	}
+
+	mutex_unlock(&slab_mutex);
+	cpus_read_unlock();
+
+	rcu_barrier();
+}
+
 /*
  * Use the cpu notifier to insure that the cpu slabs are flushed when
  * necessary.
@@ -5413,6 +5539,134 @@ bool free_to_pcs(struct kmem_cache *s, void *object)
 	return true;
 }
 
+static void rcu_free_sheaf(struct rcu_head *head)
+{
+	struct slab_sheaf *sheaf;
+	struct node_barn *barn;
+	struct kmem_cache *s;
+
+	sheaf = container_of(head, struct slab_sheaf, rcu_head);
+
+	s = sheaf->cache;
+
+	/*
+	 * This may remove some objects due to slab_free_hook() returning false,
+	 * so that the sheaf might no longer be completely full. But it's easier
+	 * to handle it as full (unless it became completely empty), as the code
+	 * handles it fine. The only downside is that sheaf will serve fewer
+	 * allocations when reused. It only happens due to debugging, which is a
+	 * performance hit anyway.
+	 */
+	__rcu_free_sheaf_prepare(s, sheaf);
+
+	barn = get_node(s, numa_mem_id())->barn;
+
+	/* due to slab_free_hook() */
+	if (unlikely(sheaf->size == 0))
+		goto empty;
+
+	/*
+	 * Checking nr_full/nr_empty outside lock avoids contention in case the
+	 * barn is at the respective limit. Due to the race we might go over the
+	 * limit but that should be rare and harmless.
+	 */
+
+	if (data_race(barn->nr_full) < MAX_FULL_SHEAVES) {
+		stat(s, BARN_PUT);
+		barn_put_full_sheaf(barn, sheaf);
+		return;
+	}
+
+	stat(s, BARN_PUT_FAIL);
+	sheaf_flush_unused(s, sheaf);
+
+empty:
+	if (data_race(barn->nr_empty) < MAX_EMPTY_SHEAVES) {
+		barn_put_empty_sheaf(barn, sheaf);
+		return;
+	}
+
+	free_empty_sheaf(s, sheaf);
+}
+
+bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj)
+{
+	struct slub_percpu_sheaves *pcs;
+	struct slab_sheaf *rcu_sheaf;
+
+	if (!local_trylock(&s->cpu_sheaves->lock))
+		goto fail;
+
+	pcs = this_cpu_ptr(s->cpu_sheaves);
+
+	if (unlikely(!pcs->rcu_free)) {
+
+		struct slab_sheaf *empty;
+		struct node_barn *barn;
+
+		if (pcs->spare && pcs->spare->size == 0) {
+			pcs->rcu_free = pcs->spare;
+			pcs->spare = NULL;
+			goto do_free;
+		}
+
+		barn = get_barn(s);
+
+		empty = barn_get_empty_sheaf(barn);
+
+		if (empty) {
+			pcs->rcu_free = empty;
+			goto do_free;
+		}
+
+		local_unlock(&s->cpu_sheaves->lock);
+
+		empty = alloc_empty_sheaf(s, GFP_NOWAIT);
+
+		if (!empty)
+			goto fail;
+
+		if (!local_trylock(&s->cpu_sheaves->lock)) {
+			barn_put_empty_sheaf(barn, empty);
+			goto fail;
+		}
+
+		pcs = this_cpu_ptr(s->cpu_sheaves);
+
+		if (unlikely(pcs->rcu_free))
+			barn_put_empty_sheaf(barn, empty);
+		else
+			pcs->rcu_free = empty;
+	}
+
+do_free:
+
+	rcu_sheaf = pcs->rcu_free;
+
+	rcu_sheaf->objects[rcu_sheaf->size++] = obj;
+
+	if (likely(rcu_sheaf->size < s->sheaf_capacity))
+		rcu_sheaf = NULL;
+	else
+		pcs->rcu_free = NULL;
+
+	/*
+	 * we flush before local_unlock to make sure a racing
+	 * flush_all_rcu_sheaves() doesn't miss this sheaf
+	 */
+	if (rcu_sheaf)
+		call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf);
+
+	local_unlock(&s->cpu_sheaves->lock);
+
+	stat(s, FREE_RCU_SHEAF);
+	return true;
+
+fail:
+	stat(s, FREE_RCU_SHEAF_FAIL);
+	return false;
+}
+
 /*
  * Bulk free objects to the percpu sheaves.
  * Unlike free_to_pcs() this includes the calls to all necessary hooks
@@ -6909,6 +7163,11 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
 	struct kmem_cache_node *n;
 
 	flush_all_cpus_locked(s);
+
+	/* we might have rcu sheaves in flight */
+	if (s->cpu_sheaves)
+		rcu_barrier();
+
 	/* Attempt to free all objects */
 	for_each_kmem_cache_node(s, node, n) {
 		if (n->barn)
@@ -8284,6 +8543,8 @@ STAT_ATTR(ALLOC_PCS, alloc_cpu_sheaf);
 STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
 STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
 STAT_ATTR(FREE_PCS, free_cpu_sheaf);
+STAT_ATTR(FREE_RCU_SHEAF, free_rcu_sheaf);
+STAT_ATTR(FREE_RCU_SHEAF_FAIL, free_rcu_sheaf_fail);
 STAT_ATTR(FREE_FASTPATH, free_fastpath);
 STAT_ATTR(FREE_SLOWPATH, free_slowpath);
 STAT_ATTR(FREE_FROZEN, free_frozen);
@@ -8382,6 +8643,8 @@ static struct attribute *slab_attrs[] = {
 	&alloc_fastpath_attr.attr,
 	&alloc_slowpath_attr.attr,
 	&free_cpu_sheaf_attr.attr,
+	&free_rcu_sheaf_attr.attr,
+	&free_rcu_sheaf_fail_attr.attr,
 	&free_fastpath_attr.attr,
 	&free_slowpath_attr.attr,
 	&free_frozen_attr.attr,
-- 
2.51.0
Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations
Posted by Harry Yoo 4 months, 3 weeks ago
On Thu, Sep 18, 2025 at 10:09:34AM +0200, Vlastimil Babka wrote:
> On 9/17/25 16:14, Vlastimil Babka wrote:
> > On 9/17/25 15:34, Harry Yoo wrote:
> >> On Wed, Sep 17, 2025 at 03:21:31PM +0200, Vlastimil Babka wrote:
> >>> On 9/17/25 15:07, Harry Yoo wrote:
> >>> > On Wed, Sep 17, 2025 at 02:05:49PM +0200, Vlastimil Babka wrote:
> >>> >> On 9/17/25 13:32, Harry Yoo wrote:
> >>> >> > On Wed, Sep 17, 2025 at 11:55:10AM +0200, Vlastimil Babka wrote:
> >>> >> >> On 9/17/25 10:30, Harry Yoo wrote:
> >>> >> >> > On Wed, Sep 10, 2025 at 10:01:06AM +0200, Vlastimil Babka wrote:
> >>> >> >> >> +				sfw->skip = true;
> >>> >> >> >> +				continue;
> >>> >> >> >> +			}
> >>> >> >> >>
> >>> >> >> >> +			INIT_WORK(&sfw->work, flush_rcu_sheaf);
> >>> >> >> >> +			sfw->skip = false;
> >>> >> >> >> +			sfw->s = s;
> >>> >> >> >> +			queue_work_on(cpu, flushwq, &sfw->work);
> >>> >> >> >> +			flushed = true;
> >>> >> >> >> +		}
> >>> >> >> >> +
> >>> >> >> >> +		for_each_online_cpu(cpu) {
> >>> >> >> >> +			sfw = &per_cpu(slub_flush, cpu);
> >>> >> >> >> +			if (sfw->skip)
> >>> >> >> >> +				continue;
> >>> >> >> >> +			flush_work(&sfw->work);
> >>> >> >> >> +		}
> >>> >> >> >> +
> >>> >> >> >> +		mutex_unlock(&flush_lock);
> >>> >> >> >> +	}
> >>> >> >> >> +
> >>> >> >> >> +	mutex_unlock(&slab_mutex);
> >>> >> >> >> +	cpus_read_unlock();
> >>> >> >> >> +
> >>> >> >> >> +	if (flushed)
> >>> >> >> >> +		rcu_barrier();
> >>> >> >> > 
> >>> >> >> > I think we need to call rcu_barrier() even if flushed == false?
> >>> >> >> > 
> >>> >> >> > Maybe a kvfree_rcu()'d object was already waiting for the rcu callback to
> >>> >> >> > be processed before flush_all_rcu_sheaves() is called, and
> >>> >> >> > in flush_all_rcu_sheaves() we skipped all (cache, cpu) pairs,
> >>> >> >> > so flushed == false but the rcu callback isn't processed yet
> >>> >> >> > by the end of the function?
> >>> >> >> > 
> >>> >> >> > That sounds like a very unlikely to happen in a realistic scenario,
> >>> >> >> > but still possible...
> >>> >> >> 
> >>> >> >> Yes also good point, will flush unconditionally.
> >>> >> >> 
> >>> >> >> Maybe in __kfree_rcu_sheaf() I should also move the call_rcu(...) before
> >>> >> >> local_unlock().
> >>> >> >>
> >>> >> >> So we don't end up seeing a NULL pcs->rcu_free in
> >>> >> >> flush_all_rcu_sheaves() because __kfree_rcu_sheaf() already set it to NULL,
> >>> >> >> but didn't yet do the call_rcu() as it got preempted after local_unlock().
> >>> >> > 
> >>> >> > Makes sense to me.
> >>> > 
> >>> > Wait, I'm confused.
> >>> > 
> >>> > I think the caller of kvfree_rcu_barrier() should make sure that it's invoked
> >>> > only after a kvfree_rcu(X, rhs) call has returned, if the caller expects
> >>> > the object X to be freed before kvfree_rcu_barrier() returns?
> >>> 
> >>> Hmm, the caller of kvfree_rcu(X, rhs) might have returned without filling up
> >>> the rcu_sheaf fully and thus without submitting it to call_rcu(), then
> >>> migrated to another cpu. Then it calls kvfree_rcu_barrier() while another
> >>> unrelated kvfree_rcu(X, rhs) call on the previous cpu is for the same
> >>> kmem_cache (kvfree_rcu_barrier() is not only for cache destruction), fills
> >>> up the rcu_sheaf fully and is about to call_rcu() on it. And since that
> >>> sheaf also contains the object X, we should make sure that is flushed.
> >> 
> >> I was going to say "but we queue and wait for the flushing work to
> >> complete, so the sheaf containing object X should be flushed?"
> >> 
> >> But nah, that's true only if we see pcs->rcu_free != NULL in
> >> flush_all_rcu_sheaves().
> >> 
> >> You are right...
> >> 
> >> Hmm, maybe it's simpler to fix this by never skipping queueing the work
> >> even when pcs->rcu_sheaf == NULL?
> > 
> > I guess it's simpler, yeah.
> 
> So what about this? The unconditional queueing should cover all races with
> __kfree_rcu_sheaf() so there's just unconditional rcu_barrier() in the end.
> 
> From 0722b29fa1625b31c05d659d1d988ec882247b38 Mon Sep 17 00:00:00 2001
> From: Vlastimil Babka <vbabka@suse.cz>
> Date: Wed, 3 Sep 2025 14:59:46 +0200
> Subject: [PATCH] slab: add sheaf support for batching kfree_rcu() operations
> 
> Extend the sheaf infrastructure for more efficient kfree_rcu() handling.
> For caches with sheaves, on each cpu maintain a rcu_free sheaf in
> addition to main and spare sheaves.
> 
> kfree_rcu() operations will try to put objects on this sheaf. Once full,
> the sheaf is detached and submitted to call_rcu() with a handler that
> will try to put it in the barn, or flush to slab pages using bulk free,
> when the barn is full. Then a new empty sheaf must be obtained to put
> more objects there.
> 
> It's possible that no free sheaves are available to use for a new
> rcu_free sheaf, and the allocation in kfree_rcu() context can only use
> GFP_NOWAIT and thus may fail. In that case, fall back to the existing
> kfree_rcu() implementation.
> 
> Expected advantages:
> - batching the kfree_rcu() operations, that could eventually replace the
>   existing batching
> - sheaves can be reused for allocations via barn instead of being
>   flushed to slabs, which is more efficient
>   - this includes cases where only some cpus are allowed to process rcu
>     callbacks (Android)
> 
> Possible disadvantage:
> - objects might be waiting for more than their grace period (it is
>   determined by the last object freed into the sheaf), increasing memory
>   usage - but the existing batching does that too.
> 
> Only implement this for CONFIG_KVFREE_RCU_BATCHED as the tiny
> implementation favors smaller memory footprint over performance.
> 
> Also for now skip the usage of rcu sheaf for CONFIG_PREEMPT_RT as the
> contexts where kfree_rcu() is called might not be compatible with taking
> a barn spinlock or a GFP_NOWAIT allocation of a new sheaf taking a
> spinlock - the current kfree_rcu() implementation avoids doing that.
> 
> Teach kvfree_rcu_barrier() to flush all rcu_free sheaves from all caches
> that have them. This is not a cheap operation, but the barrier usage is
> rare - currently kmem_cache_destroy() or on module unload.
> 
> Add CONFIG_SLUB_STATS counters free_rcu_sheaf and free_rcu_sheaf_fail to
> count how many kfree_rcu() used the rcu_free sheaf successfully and how
> many had to fall back to the existing implementation.
> 
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> ---

Looks good to me,
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>

with a nit:

> +bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj)
> +{
> +	struct slub_percpu_sheaves *pcs;
> +	struct slab_sheaf *rcu_sheaf;
> +
> +	if (!local_trylock(&s->cpu_sheaves->lock))
> +		goto fail;
> +
> +	pcs = this_cpu_ptr(s->cpu_sheaves);
> +
> +	if (unlikely(!pcs->rcu_free)) {
> +
> +		struct slab_sheaf *empty;
> +		struct node_barn *barn;
> +
> +		if (pcs->spare && pcs->spare->size == 0) {
> +			pcs->rcu_free = pcs->spare;
> +			pcs->spare = NULL;
> +			goto do_free;
> +		}
> +
> +		barn = get_barn(s);
> +
> +		empty = barn_get_empty_sheaf(barn);
> +
> +		if (empty) {
> +			pcs->rcu_free = empty;
> +			goto do_free;
> +		}
> +
> +		local_unlock(&s->cpu_sheaves->lock);
> +
> +		empty = alloc_empty_sheaf(s, GFP_NOWAIT);
> +
> +		if (!empty)
> +			goto fail;
> +
> +		if (!local_trylock(&s->cpu_sheaves->lock)) {
> +			barn_put_empty_sheaf(barn, empty);
> +			goto fail;
> +		}
> +
> +		pcs = this_cpu_ptr(s->cpu_sheaves);
> +
> +		if (unlikely(pcs->rcu_free))
> +			barn_put_empty_sheaf(barn, empty);
> +		else
> +			pcs->rcu_free = empty;
> +	}
> +
> +do_free:
> +
> +	rcu_sheaf = pcs->rcu_free;
> +
> +	rcu_sheaf->objects[rcu_sheaf->size++] = obj;
> +
> +	if (likely(rcu_sheaf->size < s->sheaf_capacity))
> +		rcu_sheaf = NULL;
> +	else
> +		pcs->rcu_free = NULL;
> +
> +	/*
> +	 * we flush before local_unlock to make sure a racing
> +	 * flush_all_rcu_sheaves() doesn't miss this sheaf
> +	 */
> +	if (rcu_sheaf)
> +		call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf);

nit: now we don't have to put this inside local_lock()~local_unlock()?

-- 
Cheers,
Harry / Hyeonggon

> +	local_unlock(&s->cpu_sheaves->lock);
> +
> +	stat(s, FREE_RCU_SHEAF);
> +	return true;
> +
> +fail:
> +	stat(s, FREE_RCU_SHEAF_FAIL);
> +	return false;
> +}
Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations
Posted by Vlastimil Babka 4 months, 3 weeks ago
On 9/19/25 08:47, Harry Yoo wrote:
> On Thu, Sep 18, 2025 at 10:09:34AM +0200, Vlastimil Babka wrote:
>> On 9/17/25 16:14, Vlastimil Babka wrote:
>> > On 9/17/25 15:34, Harry Yoo wrote:
>> >> On Wed, Sep 17, 2025 at 03:21:31PM +0200, Vlastimil Babka wrote:
>> >>> On 9/17/25 15:07, Harry Yoo wrote:
>> >>> > On Wed, Sep 17, 2025 at 02:05:49PM +0200, Vlastimil Babka wrote:
>> >>> >> On 9/17/25 13:32, Harry Yoo wrote:
>> >>> >> > On Wed, Sep 17, 2025 at 11:55:10AM +0200, Vlastimil Babka wrote:
>> >>> >> >> On 9/17/25 10:30, Harry Yoo wrote:
>> >>> >> >> > On Wed, Sep 10, 2025 at 10:01:06AM +0200, Vlastimil Babka wrote:
>> >>> >> >> >> +				sfw->skip = true;
>> >>> >> >> >> +				continue;
>> >>> >> >> >> +			}
>> >>> >> >> >>
>> >>> >> >> >> +			INIT_WORK(&sfw->work, flush_rcu_sheaf);
>> >>> >> >> >> +			sfw->skip = false;
>> >>> >> >> >> +			sfw->s = s;
>> >>> >> >> >> +			queue_work_on(cpu, flushwq, &sfw->work);
>> >>> >> >> >> +			flushed = true;
>> >>> >> >> >> +		}
>> >>> >> >> >> +
>> >>> >> >> >> +		for_each_online_cpu(cpu) {
>> >>> >> >> >> +			sfw = &per_cpu(slub_flush, cpu);
>> >>> >> >> >> +			if (sfw->skip)
>> >>> >> >> >> +				continue;
>> >>> >> >> >> +			flush_work(&sfw->work);
>> >>> >> >> >> +		}
>> >>> >> >> >> +
>> >>> >> >> >> +		mutex_unlock(&flush_lock);
>> >>> >> >> >> +	}
>> >>> >> >> >> +
>> >>> >> >> >> +	mutex_unlock(&slab_mutex);
>> >>> >> >> >> +	cpus_read_unlock();
>> >>> >> >> >> +
>> >>> >> >> >> +	if (flushed)
>> >>> >> >> >> +		rcu_barrier();
>> >>> >> >> > 
>> >>> >> >> > I think we need to call rcu_barrier() even if flushed == false?
>> >>> >> >> > 
>> >>> >> >> > Maybe a kvfree_rcu()'d object was already waiting for the rcu callback to
>> >>> >> >> > be processed before flush_all_rcu_sheaves() is called, and
>> >>> >> >> > in flush_all_rcu_sheaves() we skipped all (cache, cpu) pairs,
>> >>> >> >> > so flushed == false but the rcu callback isn't processed yet
>> >>> >> >> > by the end of the function?
>> >>> >> >> > 
>> >>> >> >> > That sounds like a very unlikely to happen in a realistic scenario,
>> >>> >> >> > but still possible...
>> >>> >> >> 
>> >>> >> >> Yes also good point, will flush unconditionally.
>> >>> >> >> 
>> >>> >> >> Maybe in __kfree_rcu_sheaf() I should also move the call_rcu(...) before
>> >>> >> >> local_unlock().
>> >>> >> >>
>> >>> >> >> So we don't end up seeing a NULL pcs->rcu_free in
>> >>> >> >> flush_all_rcu_sheaves() because __kfree_rcu_sheaf() already set it to NULL,
>> >>> >> >> but didn't yet do the call_rcu() as it got preempted after local_unlock().
>> >>> >> > 
>> >>> >> > Makes sense to me.
>> >>> > 
>> >>> > Wait, I'm confused.
>> >>> > 
>> >>> > I think the caller of kvfree_rcu_barrier() should make sure that it's invoked
>> >>> > only after a kvfree_rcu(X, rhs) call has returned, if the caller expects
>> >>> > the object X to be freed before kvfree_rcu_barrier() returns?
>> >>> 
>> >>> Hmm, the caller of kvfree_rcu(X, rhs) might have returned without filling up
>> >>> the rcu_sheaf fully and thus without submitting it to call_rcu(), then
>> >>> migrated to another cpu. Then it calls kvfree_rcu_barrier() while another
>> >>> unrelated kvfree_rcu(X, rhs) call on the previous cpu is for the same
>> >>> kmem_cache (kvfree_rcu_barrier() is not only for cache destruction), fills
>> >>> up the rcu_sheaf fully and is about to call_rcu() on it. And since that
>> >>> sheaf also contains the object X, we should make sure that is flushed.
>> >> 
>> >> I was going to say "but we queue and wait for the flushing work to
>> >> complete, so the sheaf containing object X should be flushed?"
>> >> 
>> >> But nah, that's true only if we see pcs->rcu_free != NULL in
>> >> flush_all_rcu_sheaves().
>> >> 
>> >> You are right...
>> >> 
>> >> Hmm, maybe it's simpler to fix this by never skipping queueing the work
>> >> even when pcs->rcu_sheaf == NULL?
>> > 
>> > I guess it's simpler, yeah.
>> 
>> So what about this? The unconditional queueing should cover all races with
>> __kfree_rcu_sheaf() so there's just unconditional rcu_barrier() in the end.
>> 
>> From 0722b29fa1625b31c05d659d1d988ec882247b38 Mon Sep 17 00:00:00 2001
>> From: Vlastimil Babka <vbabka@suse.cz>
>> Date: Wed, 3 Sep 2025 14:59:46 +0200
>> Subject: [PATCH] slab: add sheaf support for batching kfree_rcu() operations
>> 
>> Extend the sheaf infrastructure for more efficient kfree_rcu() handling.
>> For caches with sheaves, on each cpu maintain a rcu_free sheaf in
>> addition to main and spare sheaves.
>> 
>> kfree_rcu() operations will try to put objects on this sheaf. Once full,
>> the sheaf is detached and submitted to call_rcu() with a handler that
>> will try to put it in the barn, or flush to slab pages using bulk free,
>> when the barn is full. Then a new empty sheaf must be obtained to put
>> more objects there.
>> 
>> It's possible that no free sheaves are available to use for a new
>> rcu_free sheaf, and the allocation in kfree_rcu() context can only use
>> GFP_NOWAIT and thus may fail. In that case, fall back to the existing
>> kfree_rcu() implementation.
>> 
>> Expected advantages:
>> - batching the kfree_rcu() operations, that could eventually replace the
>>   existing batching
>> - sheaves can be reused for allocations via barn instead of being
>>   flushed to slabs, which is more efficient
>>   - this includes cases where only some cpus are allowed to process rcu
>>     callbacks (Android)
>> 
>> Possible disadvantage:
>> - objects might be waiting for more than their grace period (it is
>>   determined by the last object freed into the sheaf), increasing memory
>>   usage - but the existing batching does that too.
>> 
>> Only implement this for CONFIG_KVFREE_RCU_BATCHED as the tiny
>> implementation favors smaller memory footprint over performance.
>> 
>> Also for now skip the usage of rcu sheaf for CONFIG_PREEMPT_RT as the
>> contexts where kfree_rcu() is called might not be compatible with taking
>> a barn spinlock or a GFP_NOWAIT allocation of a new sheaf taking a
>> spinlock - the current kfree_rcu() implementation avoids doing that.
>> 
>> Teach kvfree_rcu_barrier() to flush all rcu_free sheaves from all caches
>> that have them. This is not a cheap operation, but the barrier usage is
>> rare - currently kmem_cache_destroy() or on module unload.
>> 
>> Add CONFIG_SLUB_STATS counters free_rcu_sheaf and free_rcu_sheaf_fail to
>> count how many kfree_rcu() used the rcu_free sheaf successfully and how
>> many had to fall back to the existing implementation.
>> 
>> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
>> ---
> 
> Looks good to me,
> Reviewed-by: Harry Yoo <harry.yoo@oracle.com>

Thanks.

>> +do_free:
>> +
>> +	rcu_sheaf = pcs->rcu_free;
>> +
>> +	rcu_sheaf->objects[rcu_sheaf->size++] = obj;
>> +
>> +	if (likely(rcu_sheaf->size < s->sheaf_capacity))
>> +		rcu_sheaf = NULL;
>> +	else
>> +		pcs->rcu_free = NULL;
>> +
>> +	/*
>> +	 * we flush before local_unlock to make sure a racing
>> +	 * flush_all_rcu_sheaves() doesn't miss this sheaf
>> +	 */
>> +	if (rcu_sheaf)
>> +		call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf);
> 
> nit: now we don't have to put this inside local_lock()~local_unlock()?

I think we still need to? AFAICS what I wrote before is still true:

The caller of kvfree_rcu(X, rhs) might have returned without filling up
the rcu_sheaf fully and thus without submitting it to call_rcu(), then
migrated to another cpu. Then it calls kvfree_rcu_barrier() while another
unrelated kvfree_rcu(X, rhs) call on the previous cpu is for the same
kmem_cache (kvfree_rcu_barrier() is not only for cache destruction), fills
up the rcu_sheaf fully and is about to call_rcu() on it.

If it can local_unlock() before doing the call_rcu(), it can local_unlock(),
get preempted, and our flush workqueue handler will only see there's no
rcu_free sheaf and do nothing.
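
Spelled out, with the call_rcu() hypothetically moved after local_unlock()
(a simplified sketch for one cpu; function names as in the patch, everything
else omitted):

	kvfree_rcu() caller on cpu N            kvfree_rcu_barrier() caller
	----------------------------            ---------------------------
	__kfree_rcu_sheaf():
	  local_trylock(&s->cpu_sheaves->lock)
	  rcu_free sheaf becomes full,
	  pcs->rcu_free = NULL
	  local_unlock(&s->cpu_sheaves->lock)
	                                        flush_all_rcu_sheaves()
	                                        queues flush_rcu_sheaf()
	                                        on cpu N
	  (preempted by the flush worker,
	   which sees pcs->rcu_free == NULL
	   and does nothing)
	                                        flush_work() returns
	                                        rcu_barrier() - no callback
	                                        queued yet, nothing to
	                                        wait for
	  call_rcu(&rcu_sheaf->rcu_head,
	           rcu_free_sheaf)
	  -> the sheaf (and object X in it)
	     is freed only after
	     kvfree_rcu_barrier() has returned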

If it must call_rcu() before local_unlock(), our flush workqueue handler
will not execute on the cpu until it performs the call_rcu() and
local_unlock(), because it can't preempt that section (!RT) or will have to
wait doing local_lock() in flush_rcu_sheaf() (RT) - here it's important it
takes the lock unconditionally.

Or am I missing something?
Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations
Posted by Harry Yoo 4 months, 3 weeks ago
On Fri, Sep 19, 2025 at 09:02:22AM +0200, Vlastimil Babka wrote:
> On 9/19/25 08:47, Harry Yoo wrote:
> > On Thu, Sep 18, 2025 at 10:09:34AM +0200, Vlastimil Babka wrote:
> >> On 9/17/25 16:14, Vlastimil Babka wrote:
> >> > On 9/17/25 15:34, Harry Yoo wrote:
> >> >> On Wed, Sep 17, 2025 at 03:21:31PM +0200, Vlastimil Babka wrote:
> >> >>> On 9/17/25 15:07, Harry Yoo wrote:
> >> >>> > On Wed, Sep 17, 2025 at 02:05:49PM +0200, Vlastimil Babka wrote:
> >> >>> >> On 9/17/25 13:32, Harry Yoo wrote:
> >> >>> >> > On Wed, Sep 17, 2025 at 11:55:10AM +0200, Vlastimil Babka wrote:
> >> >>> >> >> On 9/17/25 10:30, Harry Yoo wrote:
> >> >>> >> >> > On Wed, Sep 10, 2025 at 10:01:06AM +0200, Vlastimil Babka wrote:
> >> >>> >> >> >> +				sfw->skip = true;
> >> >>> >> >> >> +				continue;
> >> >>> >> >> >> +			}
> >> >>> >> >> >>
> >> >>> >> >> >> +			INIT_WORK(&sfw->work, flush_rcu_sheaf);
> >> >>> >> >> >> +			sfw->skip = false;
> >> >>> >> >> >> +			sfw->s = s;
> >> >>> >> >> >> +			queue_work_on(cpu, flushwq, &sfw->work);
> >> >>> >> >> >> +			flushed = true;
> >> >>> >> >> >> +		}
> >> >>> >> >> >> +
> >> >>> >> >> >> +		for_each_online_cpu(cpu) {
> >> >>> >> >> >> +			sfw = &per_cpu(slub_flush, cpu);
> >> >>> >> >> >> +			if (sfw->skip)
> >> >>> >> >> >> +				continue;
> >> >>> >> >> >> +			flush_work(&sfw->work);
> >> >>> >> >> >> +		}
> >> >>> >> >> >> +
> >> >>> >> >> >> +		mutex_unlock(&flush_lock);
> >> >>> >> >> >> +	}
> >> >>> >> >> >> +
> >> >>> >> >> >> +	mutex_unlock(&slab_mutex);
> >> >>> >> >> >> +	cpus_read_unlock();
> >> >>> >> >> >> +
> >> >>> >> >> >> +	if (flushed)
> >> >>> >> >> >> +		rcu_barrier();
> >> >>> >> >> > 
> >> >>> >> >> > I think we need to call rcu_barrier() even if flushed == false?
> >> >>> >> >> > 
> >> >>> >> >> > Maybe a kvfree_rcu()'d object was already waiting for the rcu callback to
> >> >>> >> >> > be processed before flush_all_rcu_sheaves() is called, and
> >> >>> >> >> > in flush_all_rcu_sheaves() we skipped all (cache, cpu) pairs,
> >> >>> >> >> > so flushed == false but the rcu callback isn't processed yet
> >> >>> >> >> > by the end of the function?
> >> >>> >> >> > 
> >> >>> >> >> > That sounds like a very unlikely to happen in a realistic scenario,
> >> >>> >> >> > but still possible...
> >> >>> >> >> 
> >> >>> >> >> Yes also good point, will flush unconditionally.
> >> >>> >> >> 
> >> >>> >> >> Maybe in __kfree_rcu_sheaf() I should also move the call_rcu(...) before
> >> >>> >> >> local_unlock().
> >> >>> >> >>
> >> >>> >> >> So we don't end up seeing a NULL pcs->rcu_free in
> >> >>> >> >> flush_all_rcu_sheaves() because __kfree_rcu_sheaf() already set it to NULL,
> >> >>> >> >> but didn't yet do the call_rcu() as it got preempted after local_unlock().
> >> >>> >> > 
> >> >>> >> > Makes sense to me.
> >> >>> > 
> >> >>> > Wait, I'm confused.
> >> >>> > 
> >> >>> > I think the caller of kvfree_rcu_barrier() should make sure that it's invoked
> >> >>> > only after a kvfree_rcu(X, rhs) call has returned, if the caller expects
> >> >>> > the object X to be freed before kvfree_rcu_barrier() returns?
> >> >>> 
> >> >>> Hmm, the caller of kvfree_rcu(X, rhs) might have returned without filling up
> >> >>> the rcu_sheaf fully and thus without submitting it to call_rcu(), then
> >> >>> migrated to another cpu. Then it calls kvfree_rcu_barrier() while another
> >> >>> unrelated kvfree_rcu(X, rhs) call on the previous cpu is for the same
> >> >>> kmem_cache (kvfree_rcu_barrier() is not only for cache destruction), fills
> >> >>> up the rcu_sheaf fully and is about to call_rcu() on it. And since that
> >> >>> sheaf also contains the object X, we should make sure that is flushed.
> >> >> 
> >> >> I was going to say "but we queue and wait for the flushing work to
> >> >> complete, so the sheaf containing object X should be flushed?"
> >> >> 
> >> >> But nah, that's true only if we see pcs->rcu_free != NULL in
> >> >> flush_all_rcu_sheaves().
> >> >> 
> >> >> You are right...
> >> >> 
> >> >> Hmm, maybe it's simpler to fix this by never skipping queueing the work
> >> >> even when pcs->rcu_sheaf == NULL?
> >> > 
> >> > I guess it's simpler, yeah.
> >> 
> >> So what about this? The unconditional queueing should cover all races with
> >> __kfree_rcu_sheaf() so there's just unconditional rcu_barrier() in the end.
> >> 
> >> From 0722b29fa1625b31c05d659d1d988ec882247b38 Mon Sep 17 00:00:00 2001
> >> From: Vlastimil Babka <vbabka@suse.cz>
> >> Date: Wed, 3 Sep 2025 14:59:46 +0200
> >> Subject: [PATCH] slab: add sheaf support for batching kfree_rcu() operations
> >> 
> >> Extend the sheaf infrastructure for more efficient kfree_rcu() handling.
> >> For caches with sheaves, on each cpu maintain a rcu_free sheaf in
> >> addition to main and spare sheaves.
> >> 
> >> kfree_rcu() operations will try to put objects on this sheaf. Once full,
> >> the sheaf is detached and submitted to call_rcu() with a handler that
> >> will try to put it in the barn, or flush to slab pages using bulk free,
> >> when the barn is full. Then a new empty sheaf must be obtained to put
> >> more objects there.
> >> 
> >> It's possible that no free sheaves are available to use for a new
> >> rcu_free sheaf, and the allocation in kfree_rcu() context can only use
> >> GFP_NOWAIT and thus may fail. In that case, fall back to the existing
> >> kfree_rcu() implementation.
> >> 
> >> Expected advantages:
> >> - batching the kfree_rcu() operations, that could eventually replace the
> >>   existing batching
> >> - sheaves can be reused for allocations via barn instead of being
> >>   flushed to slabs, which is more efficient
> >>   - this includes cases where only some cpus are allowed to process rcu
> >>     callbacks (Android)
> >> 
> >> Possible disadvantage:
> >> - objects might be waiting for more than their grace period (it is
> >>   determined by the last object freed into the sheaf), increasing memory
> >>   usage - but the existing batching does that too.
> >> 
> >> Only implement this for CONFIG_KVFREE_RCU_BATCHED as the tiny
> >> implementation favors smaller memory footprint over performance.
> >> 
> >> Also for now skip the usage of rcu sheaf for CONFIG_PREEMPT_RT as the
> >> contexts where kfree_rcu() is called might not be compatible with taking
> >> a barn spinlock or a GFP_NOWAIT allocation of a new sheaf taking a
> >> spinlock - the current kfree_rcu() implementation avoids doing that.
> >> 
> >> Teach kvfree_rcu_barrier() to flush all rcu_free sheaves from all caches
> >> that have them. This is not a cheap operation, but the barrier usage is
> >> rare - currently kmem_cache_destroy() or on module unload.
> >> 
> >> Add CONFIG_SLUB_STATS counters free_rcu_sheaf and free_rcu_sheaf_fail to
> >> count how many kfree_rcu() used the rcu_free sheaf successfully and how
> >> many had to fall back to the existing implementation.
> >> 
> >> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> >> ---
> > 
> > Looks good to me,
> > Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
> 
> Thanks.
> 
> >> +do_free:
> >> +
> >> +	rcu_sheaf = pcs->rcu_free;
> >> +
> >> +	rcu_sheaf->objects[rcu_sheaf->size++] = obj;
> >> +
> >> +	if (likely(rcu_sheaf->size < s->sheaf_capacity))
> >> +		rcu_sheaf = NULL;
> >> +	else
> >> +		pcs->rcu_free = NULL;
> >> +
> >> +	/*
> >> +	 * we flush before local_unlock to make sure a racing
> >> +	 * flush_all_rcu_sheaves() doesn't miss this sheaf
> >> +	 */
> >> +	if (rcu_sheaf)
> >> +		call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf);
> > 
> > nit: now we don't have to put this inside local_lock()~local_unlock()?
> 
> I think we still need to? AFAICS what I wrote before is still true:
> 
> The caller of kvfree_rcu(X, rhs) might have returned without filling up
> the rcu_sheaf fully and thus without submitting it to call_rcu(), then
> migrated to another cpu. Then it calls kvfree_rcu_barrier() while another
> unrelated kvfree_rcu(X, rhs) call on the previous cpu is for the same
> kmem_cache (kvfree_rcu_barrier() is not only for cache destruction), fills
> up the rcu_sheaf fully and is about to call_rcu() on it.
>
> If it can local_unlock() before doing the call_rcu(), it can local_unlock(),
> get preempted, and our flush workqueue handler will only see there's no
> rcu_free sheaf and do nothing.

Oops, you're right. So even if a previous kvfree_rcu() has returned
and then kvfree_rcu_barrier() is called, a later kvfree_rcu() call can
still make the sheaf invisible to the flush workqueue handler if it
does the call_rcu() outside the critical section, because it can be
preempted by the workqueue handler after local_unlock() but before
calling call_rcu().

> If it must call_rcu() before local_unlock(), our flush workqueue handler
> will not execute on the cpu until it performs the call_rcu() and
> local_unlock(), because it can't preempt that section (!RT) or will have to
> wait doing local_lock() in flush_rcu_sheaf() (RT) - here it's important it
> takes the lock unconditionally.

Right.

My nit was wrong and it looks good to me then!

-- 
Cheers,
Harry / Hyeonggon
Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations
Posted by Sergey Senozhatsky 5 months ago
Hi Vlastimil,

On (25/09/10 10:01), Vlastimil Babka wrote:
[..]
> +
> +	if (rcu_free)
> +		call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn);
> +}
> +
> +
> +/* needed for kvfree_rcu_barrier() */
> +void flush_all_rcu_sheaves()
> +{

mm/slub.c:3960:27: error: a function declaration without a prototype is deprecated in all versions of C [-Werror,-Wstrict-prototypes]
 3960 | void flush_all_rcu_sheaves()
      |                           ^
      |                            void

---

diff --git a/mm/slub.c b/mm/slub.c
index 11ad4173e2f2..a1eae71a0f8c 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3955,9 +3955,8 @@ static void flush_rcu_sheaf(struct work_struct *w)
 		call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn);
 }
 
-
 /* needed for kvfree_rcu_barrier() */
-void flush_all_rcu_sheaves()
+void flush_all_rcu_sheaves(void)
 {
 	struct slub_percpu_sheaves *pcs;
 	struct slub_flush_work *sfw;
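
(Before C23, an empty parameter list in a declaration means "unspecified
arguments", i.e. no prototype, which is what -Wstrict-prototypes rejects;
(void) declares a proper prototype:)

	void flush_all_rcu_sheaves();		/* no prototype: rejected */
	void flush_all_rcu_sheaves(void);	/* proper prototype */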
Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations
Posted by Vlastimil Babka 5 months ago
On 9/12/25 02:38, Sergey Senozhatsky wrote:
> Hi Vlastimil,
> 
> On (25/09/10 10:01), Vlastimil Babka wrote:
> [..]
>> +
>> +	if (rcu_free)
>> +		call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn);
>> +}
>> +
>> +
>> +/* needed for kvfree_rcu_barrier() */
>> +void flush_all_rcu_sheaves()
>> +{
> 
> mm/slub.c:3960:27: error: a function declaration without a prototype is deprecated in all versions of C [-Werror,-Wstrict-prototypes]
>  3960 | void flush_all_rcu_sheaves()
>       |                           ^
>       |                            void
> 
> ---

Thanks, the bots told me too and it's fixed in -next

> diff --git a/mm/slub.c b/mm/slub.c
> index 11ad4173e2f2..a1eae71a0f8c 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -3955,9 +3955,8 @@ static void flush_rcu_sheaf(struct work_struct *w)
>  		call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn);
>  }
>  
> -
>  /* needed for kvfree_rcu_barrier() */
> -void flush_all_rcu_sheaves()
> +void flush_all_rcu_sheaves(void)
>  {
>  	struct slub_percpu_sheaves *pcs;
>  	struct slub_flush_work *sfw;