[v8] SLUB percpu sheaves

[PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations

Posted by Vlastimil Babka 3 weeks, 1 day ago

Extend the sheaf infrastructure for more efficient kfree_rcu() handling.
For caches with sheaves, on each cpu maintain a rcu_free sheaf in
addition to main and spare sheaves.

kfree_rcu() operations will try to put objects on this sheaf. Once full,
the sheaf is detached and submitted to call_rcu() with a handler that
will try to put it in the barn, or flush to slab pages using bulk free,
when the barn is full. Then a new empty sheaf must be obtained to put
more objects there.

It's possible that no free sheaves are available to use for a new
rcu_free sheaf, and the allocation in kfree_rcu() context can only use
GFP_NOWAIT and thus may fail. In that case, fall back to the existing
kfree_rcu() implementation.

Expected advantages:
- batching the kfree_rcu() operations, that could eventually replace the
  existing batching
- sheaves can be reused for allocations via barn instead of being
  flushed to slabs, which is more efficient
  - this includes cases where only some cpus are allowed to process rcu
    callbacks (Android)

Possible disadvantage:
- objects might be waiting for more than their grace period (it is
  determined by the last object freed into the sheaf), increasing memory
  usage - but the existing batching does that too.

Only implement this for CONFIG_KVFREE_RCU_BATCHED as the tiny
implementation favors smaller memory footprint over performance.

Also for now skip the usage of rcu sheaf for CONFIG_PREEMPT_RT as the
contexts where kfree_rcu() is called might not be compatible with taking
a barn spinlock or a GFP_NOWAIT allocation of a new sheaf taking a
spinlock - the current kfree_rcu() implementation avoids doing that.

Teach kvfree_rcu_barrier() to flush all rcu_free sheaves from all caches
that have them. This is not a cheap operation, but the barrier usage is
rare - currently kmem_cache_destroy() or on module unload.

Add CONFIG_SLUB_STATS counters free_rcu_sheaf and free_rcu_sheaf_fail to
count how many kfree_rcu() used the rcu_free sheaf successfully and how
many had to fall back to the existing implementation.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 mm/slab.h        |   3 +
 mm/slab_common.c |  26 ++++++
 mm/slub.c        | 266 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 293 insertions(+), 2 deletions(-)

diff --git a/mm/slab.h b/mm/slab.h
index 206987ce44a4d053ebe3b5e50784d2dd23822cd1..e82e51c44bd00042d433ac8b46c2b4bbbdded9b1 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -435,6 +435,9 @@ static inline bool is_kmalloc_normal(struct kmem_cache *s)
 	return !(s->flags & (SLAB_CACHE_DMA|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT));
 }
 
+bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj);
+void flush_all_rcu_sheaves(void);
+
 #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
 			 SLAB_CACHE_DMA32 | SLAB_PANIC | \
 			 SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS | \
diff --git a/mm/slab_common.c b/mm/slab_common.c
index e2b197e47866c30acdbd1fee4159f262a751c5a7..005a4319c06a01d2b616a75396fcc43766a62ddb 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1608,6 +1608,27 @@ static void kfree_rcu_work(struct work_struct *work)
 		kvfree_rcu_list(head);
 }
 
+static bool kfree_rcu_sheaf(void *obj)
+{
+	struct kmem_cache *s;
+	struct folio *folio;
+	struct slab *slab;
+
+	if (is_vmalloc_addr(obj))
+		return false;
+
+	folio = virt_to_folio(obj);
+	if (unlikely(!folio_test_slab(folio)))
+		return false;
+
+	slab = folio_slab(folio);
+	s = slab->slab_cache;
+	if (s->cpu_sheaves)
+		return __kfree_rcu_sheaf(s, obj);
+
+	return false;
+}
+
 static bool
 need_offload_krc(struct kfree_rcu_cpu *krcp)
 {
@@ -1952,6 +1973,9 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr)
 	if (!head)
 		might_sleep();
 
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT) && kfree_rcu_sheaf(ptr))
+		return;
+
 	// Queue the object but don't yet schedule the batch.
 	if (debug_rcu_head_queue(ptr)) {
 		// Probable double kfree_rcu(), just leak.
@@ -2026,6 +2050,8 @@ void kvfree_rcu_barrier(void)
 	bool queued;
 	int i, cpu;
 
+	flush_all_rcu_sheaves();
+
 	/*
 	 * Firstly we detach objects and queue them over an RCU-batch
 	 * for all CPUs. Finally queued works are flushed for each CPU.
diff --git a/mm/slub.c b/mm/slub.c
index cba188b7e04ddf86debf9bc27a2f725db1b2056e..19cd8444ae5d210c77ae767912ca1ff3fc69c2a8 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -367,6 +367,8 @@ enum stat_item {
 	ALLOC_FASTPATH,		/* Allocation from cpu slab */
 	ALLOC_SLOWPATH,		/* Allocation by getting a new cpu slab */
 	FREE_PCS,		/* Free to percpu sheaf */
+	FREE_RCU_SHEAF,		/* Free to rcu_free sheaf */
+	FREE_RCU_SHEAF_FAIL,	/* Failed to free to a rcu_free sheaf */
 	FREE_FASTPATH,		/* Free to cpu slab */
 	FREE_SLOWPATH,		/* Freeing not to cpu slab */
 	FREE_FROZEN,		/* Freeing to frozen slab */
@@ -461,6 +463,7 @@ struct slab_sheaf {
 		struct rcu_head rcu_head;
 		struct list_head barn_list;
 	};
+	struct kmem_cache *cache;
 	unsigned int size;
 	void *objects[];
 };
@@ -469,6 +472,7 @@ struct slub_percpu_sheaves {
 	local_trylock_t lock;
 	struct slab_sheaf *main; /* never NULL when unlocked */
 	struct slab_sheaf *spare; /* empty or full, may be NULL */
+	struct slab_sheaf *rcu_free; /* for batching kfree_rcu() */
 };
 
 /*
@@ -2531,6 +2535,8 @@ static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp)
 	if (unlikely(!sheaf))
 		return NULL;
 
+	sheaf->cache = s;
+
 	stat(s, SHEAF_ALLOC);
 
 	return sheaf;
@@ -2655,6 +2661,43 @@ static void sheaf_flush_unused(struct kmem_cache *s, struct slab_sheaf *sheaf)
 	sheaf->size = 0;
 }
 
+static void __rcu_free_sheaf_prepare(struct kmem_cache *s,
+				     struct slab_sheaf *sheaf)
+{
+	bool init = slab_want_init_on_free(s);
+	void **p = &sheaf->objects[0];
+	unsigned int i = 0;
+
+	while (i < sheaf->size) {
+		struct slab *slab = virt_to_slab(p[i]);
+
+		memcg_slab_free_hook(s, slab, p + i, 1);
+		alloc_tagging_slab_free_hook(s, slab, p + i, 1);
+
+		if (unlikely(!slab_free_hook(s, p[i], init, true))) {
+			p[i] = p[--sheaf->size];
+			continue;
+		}
+
+		i++;
+	}
+}
+
+static void rcu_free_sheaf_nobarn(struct rcu_head *head)
+{
+	struct slab_sheaf *sheaf;
+	struct kmem_cache *s;
+
+	sheaf = container_of(head, struct slab_sheaf, rcu_head);
+	s = sheaf->cache;
+
+	__rcu_free_sheaf_prepare(s, sheaf);
+
+	sheaf_flush_unused(s, sheaf);
+
+	free_empty_sheaf(s, sheaf);
+}
+
 /*
  * Caller needs to make sure migration is disabled in order to fully flush
  * single cpu's sheaves
@@ -2667,7 +2710,7 @@ static void sheaf_flush_unused(struct kmem_cache *s, struct slab_sheaf *sheaf)
 static void pcs_flush_all(struct kmem_cache *s)
 {
 	struct slub_percpu_sheaves *pcs;
-	struct slab_sheaf *spare;
+	struct slab_sheaf *spare, *rcu_free;
 
 	local_lock(&s->cpu_sheaves->lock);
 	pcs = this_cpu_ptr(s->cpu_sheaves);
@@ -2675,6 +2718,9 @@ static void pcs_flush_all(struct kmem_cache *s)
 	spare = pcs->spare;
 	pcs->spare = NULL;
 
+	rcu_free = pcs->rcu_free;
+	pcs->rcu_free = NULL;
+
 	local_unlock(&s->cpu_sheaves->lock);
 
 	if (spare) {
@@ -2682,6 +2728,9 @@ static void pcs_flush_all(struct kmem_cache *s)
 		free_empty_sheaf(s, spare);
 	}
 
+	if (rcu_free)
+		call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn);
+
 	sheaf_flush_main(s);
 }
 
@@ -2698,6 +2747,11 @@ static void __pcs_flush_all_cpu(struct kmem_cache *s, unsigned int cpu)
 		free_empty_sheaf(s, pcs->spare);
 		pcs->spare = NULL;
 	}
+
+	if (pcs->rcu_free) {
+		call_rcu(&pcs->rcu_free->rcu_head, rcu_free_sheaf_nobarn);
+		pcs->rcu_free = NULL;
+	}
 }
 
 static void pcs_destroy(struct kmem_cache *s)
@@ -2723,6 +2777,7 @@ static void pcs_destroy(struct kmem_cache *s)
 		 */
 
 		WARN_ON(pcs->spare);
+		WARN_ON(pcs->rcu_free);
 
 		if (!WARN_ON(pcs->main->size)) {
 			free_empty_sheaf(s, pcs->main);
@@ -3780,7 +3835,7 @@ static bool has_pcs_used(int cpu, struct kmem_cache *s)
 
 	pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
 
-	return (pcs->spare || pcs->main->size);
+	return (pcs->spare || pcs->rcu_free || pcs->main->size);
 }
 
 /*
@@ -3840,6 +3895,80 @@ static void flush_all(struct kmem_cache *s)
 	cpus_read_unlock();
 }
 
+static void flush_rcu_sheaf(struct work_struct *w)
+{
+	struct slub_percpu_sheaves *pcs;
+	struct slab_sheaf *rcu_free;
+	struct slub_flush_work *sfw;
+	struct kmem_cache *s;
+
+	sfw = container_of(w, struct slub_flush_work, work);
+	s = sfw->s;
+
+	local_lock(&s->cpu_sheaves->lock);
+	pcs = this_cpu_ptr(s->cpu_sheaves);
+
+	rcu_free = pcs->rcu_free;
+	pcs->rcu_free = NULL;
+
+	local_unlock(&s->cpu_sheaves->lock);
+
+	if (rcu_free)
+		call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn);
+}
+
+
+/* needed for kvfree_rcu_barrier() */
+void flush_all_rcu_sheaves()
+{
+	struct slub_percpu_sheaves *pcs;
+	struct slub_flush_work *sfw;
+	struct kmem_cache *s;
+	bool flushed = false;
+	unsigned int cpu;
+
+	cpus_read_lock();
+	mutex_lock(&slab_mutex);
+
+	list_for_each_entry(s, &slab_caches, list) {
+		if (!s->cpu_sheaves)
+			continue;
+
+		mutex_lock(&flush_lock);
+
+		for_each_online_cpu(cpu) {
+			sfw = &per_cpu(slub_flush, cpu);
+			pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
+
+			if (!pcs->rcu_free || !pcs->rcu_free->size) {
+				sfw->skip = true;
+				continue;
+			}
+
+			INIT_WORK(&sfw->work, flush_rcu_sheaf);
+			sfw->skip = false;
+			sfw->s = s;
+			queue_work_on(cpu, flushwq, &sfw->work);
+			flushed = true;
+		}
+
+		for_each_online_cpu(cpu) {
+			sfw = &per_cpu(slub_flush, cpu);
+			if (sfw->skip)
+				continue;
+			flush_work(&sfw->work);
+		}
+
+		mutex_unlock(&flush_lock);
+	}
+
+	mutex_unlock(&slab_mutex);
+	cpus_read_unlock();
+
+	if (flushed)
+		rcu_barrier();
+}
+
 /*
  * Use the cpu notifier to insure that the cpu slabs are flushed when
  * necessary.
@@ -5413,6 +5542,130 @@ bool free_to_pcs(struct kmem_cache *s, void *object)
 	return true;
 }
 
+static void rcu_free_sheaf(struct rcu_head *head)
+{
+	struct slab_sheaf *sheaf;
+	struct node_barn *barn;
+	struct kmem_cache *s;
+
+	sheaf = container_of(head, struct slab_sheaf, rcu_head);
+
+	s = sheaf->cache;
+
+	/*
+	 * This may remove some objects due to slab_free_hook() returning false,
+	 * so that the sheaf might no longer be completely full. But it's easier
+	 * to handle it as full (unless it became completely empty), as the code
+	 * handles it fine. The only downside is that sheaf will serve fewer
+	 * allocations when reused. It only happens due to debugging, which is a
+	 * performance hit anyway.
+	 */
+	__rcu_free_sheaf_prepare(s, sheaf);
+
+	barn = get_node(s, numa_mem_id())->barn;
+
+	/* due to slab_free_hook() */
+	if (unlikely(sheaf->size == 0))
+		goto empty;
+
+	/*
+	 * Checking nr_full/nr_empty outside lock avoids contention in case the
+	 * barn is at the respective limit. Due to the race we might go over the
+	 * limit but that should be rare and harmless.
+	 */
+
+	if (data_race(barn->nr_full) < MAX_FULL_SHEAVES) {
+		stat(s, BARN_PUT);
+		barn_put_full_sheaf(barn, sheaf);
+		return;
+	}
+
+	stat(s, BARN_PUT_FAIL);
+	sheaf_flush_unused(s, sheaf);
+
+empty:
+	if (data_race(barn->nr_empty) < MAX_EMPTY_SHEAVES) {
+		barn_put_empty_sheaf(barn, sheaf);
+		return;
+	}
+
+	free_empty_sheaf(s, sheaf);
+}
+
+bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj)
+{
+	struct slub_percpu_sheaves *pcs;
+	struct slab_sheaf *rcu_sheaf;
+
+	if (!local_trylock(&s->cpu_sheaves->lock))
+		goto fail;
+
+	pcs = this_cpu_ptr(s->cpu_sheaves);
+
+	if (unlikely(!pcs->rcu_free)) {
+
+		struct slab_sheaf *empty;
+		struct node_barn *barn;
+
+		if (pcs->spare && pcs->spare->size == 0) {
+			pcs->rcu_free = pcs->spare;
+			pcs->spare = NULL;
+			goto do_free;
+		}
+
+		barn = get_barn(s);
+
+		empty = barn_get_empty_sheaf(barn);
+
+		if (empty) {
+			pcs->rcu_free = empty;
+			goto do_free;
+		}
+
+		local_unlock(&s->cpu_sheaves->lock);
+
+		empty = alloc_empty_sheaf(s, GFP_NOWAIT);
+
+		if (!empty)
+			goto fail;
+
+		if (!local_trylock(&s->cpu_sheaves->lock)) {
+			barn_put_empty_sheaf(barn, empty);
+			goto fail;
+		}
+
+		pcs = this_cpu_ptr(s->cpu_sheaves);
+
+		if (unlikely(pcs->rcu_free))
+			barn_put_empty_sheaf(barn, empty);
+		else
+			pcs->rcu_free = empty;
+	}
+
+do_free:
+
+	rcu_sheaf = pcs->rcu_free;
+
+	rcu_sheaf->objects[rcu_sheaf->size++] = obj;
+
+	if (likely(rcu_sheaf->size < s->sheaf_capacity))
+		rcu_sheaf = NULL;
+	else
+		pcs->rcu_free = NULL;
+
+	local_unlock(&s->cpu_sheaves->lock);
+
+	if (rcu_sheaf)
+		call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf);
+
+	stat(s, FREE_RCU_SHEAF);
+	return true;
+
+fail:
+	stat(s, FREE_RCU_SHEAF_FAIL);
+	return false;
+}
+
 /*
  * Bulk free objects to the percpu sheaves.
  * Unlike free_to_pcs() this includes the calls to all necessary hooks
@@ -6909,6 +7162,11 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
 	struct kmem_cache_node *n;
 
 	flush_all_cpus_locked(s);
+
+	/* we might have rcu sheaves in flight */
+	if (s->cpu_sheaves)
+		rcu_barrier();
+
 	/* Attempt to free all objects */
 	for_each_kmem_cache_node(s, node, n) {
 		if (n->barn)
@@ -8284,6 +8542,8 @@ STAT_ATTR(ALLOC_PCS, alloc_cpu_sheaf);
 STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
 STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
 STAT_ATTR(FREE_PCS, free_cpu_sheaf);
+STAT_ATTR(FREE_RCU_SHEAF, free_rcu_sheaf);
+STAT_ATTR(FREE_RCU_SHEAF_FAIL, free_rcu_sheaf_fail);
 STAT_ATTR(FREE_FASTPATH, free_fastpath);
 STAT_ATTR(FREE_SLOWPATH, free_slowpath);
 STAT_ATTR(FREE_FROZEN, free_frozen);
@@ -8382,6 +8642,8 @@ static struct attribute *slab_attrs[] = {
 	&alloc_fastpath_attr.attr,
 	&alloc_slowpath_attr.attr,
 	&free_cpu_sheaf_attr.attr,
+	&free_rcu_sheaf_attr.attr,
+	&free_rcu_sheaf_fail_attr.attr,
 	&free_fastpath_attr.attr,
 	&free_slowpath_attr.attr,
 	&free_frozen_attr.attr,

-- 
2.51.0

Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations

Posted by Harry Yoo 2 weeks, 1 day ago

On Wed, Sep 10, 2025 at 10:01:06AM +0200, Vlastimil Babka wrote:
> Extend the sheaf infrastructure for more efficient kfree_rcu() handling.
> For caches with sheaves, on each cpu maintain a rcu_free sheaf in
> addition to main and spare sheaves.
> 
> kfree_rcu() operations will try to put objects on this sheaf. Once full,
> the sheaf is detached and submitted to call_rcu() with a handler that
> will try to put it in the barn, or flush to slab pages using bulk free,
> when the barn is full. Then a new empty sheaf must be obtained to put
> more objects there.
> 
> It's possible that no free sheaves are available to use for a new
> rcu_free sheaf, and the allocation in kfree_rcu() context can only use
> GFP_NOWAIT and thus may fail. In that case, fall back to the existing
> kfree_rcu() implementation.
> 
> Expected advantages:
> - batching the kfree_rcu() operations, that could eventually replace the
>   existing batching
> - sheaves can be reused for allocations via barn instead of being
>   flushed to slabs, which is more efficient
>   - this includes cases where only some cpus are allowed to process rcu
>     callbacks (Android)
> 
> Possible disadvantage:
> - objects might be waiting for more than their grace period (it is
>   determined by the last object freed into the sheaf), increasing memory
>   usage - but the existing batching does that too.
> 
> Only implement this for CONFIG_KVFREE_RCU_BATCHED as the tiny
> implementation favors smaller memory footprint over performance.
> 
> Also for now skip the usage of rcu sheaf for CONFIG_PREEMPT_RT as the
> contexts where kfree_rcu() is called might not be compatible with taking
> a barn spinlock or a GFP_NOWAIT allocation of a new sheaf taking a
> spinlock - the current kfree_rcu() implementation avoids doing that.
> 
> Teach kvfree_rcu_barrier() to flush all rcu_free sheaves from all caches
> that have them. This is not a cheap operation, but the barrier usage is
> rare - currently kmem_cache_destroy() or on module unload.
> 
> Add CONFIG_SLUB_STATS counters free_rcu_sheaf and free_rcu_sheaf_fail to
> count how many kfree_rcu() used the rcu_free sheaf successfully and how
> many had to fall back to the existing implementation.
> 
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> ---
>  mm/slab.h        |   3 +
>  mm/slab_common.c |  26 ++++++
>  mm/slub.c        | 266 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  3 files changed, 293 insertions(+), 2 deletions(-)
> 
> @@ -3840,6 +3895,80 @@ static void flush_all(struct kmem_cache *s)
>  	cpus_read_unlock();
>  }
>  
> +/* needed for kvfree_rcu_barrier() */
> +void flush_all_rcu_sheaves()
> +{
> +	struct slub_percpu_sheaves *pcs;
> +	struct slub_flush_work *sfw;
> +	struct kmem_cache *s;
> +	bool flushed = false;
> +	unsigned int cpu;
> +
> +	cpus_read_lock();
> +	mutex_lock(&slab_mutex);
> +
> +	list_for_each_entry(s, &slab_caches, list) {
> +		if (!s->cpu_sheaves)
> +			continue;
> +
> +		mutex_lock(&flush_lock);
> +
> +		for_each_online_cpu(cpu) {
> +			sfw = &per_cpu(slub_flush, cpu);
> +			pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
> +
> +			if (!pcs->rcu_free || !pcs->rcu_free->size) {

Is the compiler allowed to compile this to read pcs->rcu_free twice?
Something like:

flush_all_rcu_sheaves()			__kfree_rcu_sheaf()

pcs->rcu_free != NULL
					pcs->rcu_free = NULL
pcs->rcu_free == NULL
/* NULL-pointer-deref */
pcs->rcu_free->size

> +				sfw->skip = true;
> +				continue;
> +			}
>
> +			INIT_WORK(&sfw->work, flush_rcu_sheaf);
> +			sfw->skip = false;
> +			sfw->s = s;
> +			queue_work_on(cpu, flushwq, &sfw->work);
> +			flushed = true;
> +		}
> +
> +		for_each_online_cpu(cpu) {
> +			sfw = &per_cpu(slub_flush, cpu);
> +			if (sfw->skip)
> +				continue;
> +			flush_work(&sfw->work);
> +		}
> +
> +		mutex_unlock(&flush_lock);
> +	}
> +
> +	mutex_unlock(&slab_mutex);
> +	cpus_read_unlock();
> +
> +	if (flushed)
> +		rcu_barrier();

I think we need to call rcu_barrier() even if flushed == false?

Maybe a kvfree_rcu()'d object was already waiting for the rcu callback to
be processed before flush_all_rcu_sheaves() is called, and
in flush_all_rcu_sheaves() we skipped all (cache, cpu) pairs,
so flushed == false but the rcu callback isn't processed yet
by the end of the function?

That sounds like a very unlikely to happen in a realistic scenario,
but still possible...

-- 
Cheers,
Harry / Hyeonggon

Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations

Posted by Vlastimil Babka 2 weeks, 1 day ago

On 9/17/25 10:30, Harry Yoo wrote:
> On Wed, Sep 10, 2025 at 10:01:06AM +0200, Vlastimil Babka wrote:
>> +/* needed for kvfree_rcu_barrier() */
>> +void flush_all_rcu_sheaves()
>> +{
>> +	struct slub_percpu_sheaves *pcs;
>> +	struct slub_flush_work *sfw;
>> +	struct kmem_cache *s;
>> +	bool flushed = false;
>> +	unsigned int cpu;
>> +
>> +	cpus_read_lock();
>> +	mutex_lock(&slab_mutex);
>> +
>> +	list_for_each_entry(s, &slab_caches, list) {
>> +		if (!s->cpu_sheaves)
>> +			continue;
>> +
>> +		mutex_lock(&flush_lock);
>> +
>> +		for_each_online_cpu(cpu) {
>> +			sfw = &per_cpu(slub_flush, cpu);
>> +			pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
>> +
>> +			if (!pcs->rcu_free || !pcs->rcu_free->size) {
> 
> Is the compiler allowed to compile this to read pcs->rcu_free twice?
> Something like:
> 
> flush_all_rcu_sheaves()			__kfree_rcu_sheaf()
> 
> pcs->rcu_free != NULL
> 					pcs->rcu_free = NULL
> pcs->rcu_free == NULL
> /* NULL-pointer-deref */
> pcs->rcu_free->size

Good point, I'll remove the size check and simply pcs->rcu_free non-null
means we flush.

>> +				sfw->skip = true;
>> +				continue;
>> +			}
>>
>> +			INIT_WORK(&sfw->work, flush_rcu_sheaf);
>> +			sfw->skip = false;
>> +			sfw->s = s;
>> +			queue_work_on(cpu, flushwq, &sfw->work);
>> +			flushed = true;
>> +		}
>> +
>> +		for_each_online_cpu(cpu) {
>> +			sfw = &per_cpu(slub_flush, cpu);
>> +			if (sfw->skip)
>> +				continue;
>> +			flush_work(&sfw->work);
>> +		}
>> +
>> +		mutex_unlock(&flush_lock);
>> +	}
>> +
>> +	mutex_unlock(&slab_mutex);
>> +	cpus_read_unlock();
>> +
>> +	if (flushed)
>> +		rcu_barrier();
> 
> I think we need to call rcu_barrier() even if flushed == false?
> 
> Maybe a kvfree_rcu()'d object was already waiting for the rcu callback to
> be processed before flush_all_rcu_sheaves() is called, and
> in flush_all_rcu_sheaves() we skipped all (cache, cpu) pairs,
> so flushed == false but the rcu callback isn't processed yet
> by the end of the function?
> 
> That sounds like a very unlikely to happen in a realistic scenario,
> but still possible...

Yes also good point, will flush unconditionally.

Maybe in __kfree_rcu_sheaf() I should also move the call_rcu(...) before
local_unlock(). So we don't end up seeing a NULL pcs->rcu_free in
flush_all_rcu_sheaves() because __kfree_rcu_sheaf() already set it to NULL,
but didn't yet do the call_rcu() as it got preempted after local_unlock().

But then rcu_barrier() itself probably won't mean we make sure such cpus
finished the local_locked section, if we didn't queue work on them. So maybe
we need synchronize_rcu()?

Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations

Posted by Paul E. McKenney 2 weeks, 1 day ago

On Wed, Sep 17, 2025 at 11:55:10AM +0200, Vlastimil Babka wrote:
> On 9/17/25 10:30, Harry Yoo wrote:
> > On Wed, Sep 10, 2025 at 10:01:06AM +0200, Vlastimil Babka wrote:
> >> +/* needed for kvfree_rcu_barrier() */
> >> +void flush_all_rcu_sheaves()
> >> +{
> >> +	struct slub_percpu_sheaves *pcs;
> >> +	struct slub_flush_work *sfw;
> >> +	struct kmem_cache *s;
> >> +	bool flushed = false;
> >> +	unsigned int cpu;
> >> +
> >> +	cpus_read_lock();
> >> +	mutex_lock(&slab_mutex);
> >> +
> >> +	list_for_each_entry(s, &slab_caches, list) {
> >> +		if (!s->cpu_sheaves)
> >> +			continue;
> >> +
> >> +		mutex_lock(&flush_lock);
> >> +
> >> +		for_each_online_cpu(cpu) {
> >> +			sfw = &per_cpu(slub_flush, cpu);
> >> +			pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
> >> +
> >> +			if (!pcs->rcu_free || !pcs->rcu_free->size) {
> > 
> > Is the compiler allowed to compile this to read pcs->rcu_free twice?
> > Something like:
> > 
> > flush_all_rcu_sheaves()			__kfree_rcu_sheaf()
> > 
> > pcs->rcu_free != NULL
> > 					pcs->rcu_free = NULL
> > pcs->rcu_free == NULL
> > /* NULL-pointer-deref */
> > pcs->rcu_free->size
> 
> Good point, I'll remove the size check and simply pcs->rcu_free non-null
> means we flush.
> 
> >> +				sfw->skip = true;
> >> +				continue;
> >> +			}
> >>
> >> +			INIT_WORK(&sfw->work, flush_rcu_sheaf);
> >> +			sfw->skip = false;
> >> +			sfw->s = s;
> >> +			queue_work_on(cpu, flushwq, &sfw->work);
> >> +			flushed = true;
> >> +		}
> >> +
> >> +		for_each_online_cpu(cpu) {
> >> +			sfw = &per_cpu(slub_flush, cpu);
> >> +			if (sfw->skip)
> >> +				continue;
> >> +			flush_work(&sfw->work);
> >> +		}
> >> +
> >> +		mutex_unlock(&flush_lock);
> >> +	}
> >> +
> >> +	mutex_unlock(&slab_mutex);
> >> +	cpus_read_unlock();
> >> +
> >> +	if (flushed)
> >> +		rcu_barrier();
> > 
> > I think we need to call rcu_barrier() even if flushed == false?
> > 
> > Maybe a kvfree_rcu()'d object was already waiting for the rcu callback to
> > be processed before flush_all_rcu_sheaves() is called, and
> > in flush_all_rcu_sheaves() we skipped all (cache, cpu) pairs,
> > so flushed == false but the rcu callback isn't processed yet
> > by the end of the function?
> > 
> > That sounds like a very unlikely to happen in a realistic scenario,
> > but still possible...
> 
> Yes also good point, will flush unconditionally.
> 
> Maybe in __kfree_rcu_sheaf() I should also move the call_rcu(...) before
> local_unlock(). So we don't end up seeing a NULL pcs->rcu_free in
> flush_all_rcu_sheaves() because __kfree_rcu_sheaf() already set it to NULL,
> but didn't yet do the call_rcu() as it got preempted after local_unlock().
> 
> But then rcu_barrier() itself probably won't mean we make sure such cpus
> finished the local_locked section, if we didn't queue work on them. So maybe
> we need synchronize_rcu()?

Do you need both rcu_barrier() and synchronize_rcu(), maybe along with
kvfree_rcu_barrier() as well?  It would not be hard to make such a thing,
using workqueues or some such.  Not sure what the API should look like,
especially should people want other RCU flavors to get into the act
as well.

							Thanx, Paul

Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations

Posted by Vlastimil Babka 2 weeks, 1 day ago

On 9/17/25 13:36, Paul E. McKenney wrote:
> On Wed, Sep 17, 2025 at 11:55:10AM +0200, Vlastimil Babka wrote:
>> On 9/17/25 10:30, Harry Yoo wrote:
>> > On Wed, Sep 10, 2025 at 10:01:06AM +0200, Vlastimil Babka wrote:
>> >> +/* needed for kvfree_rcu_barrier() */
>> >> +void flush_all_rcu_sheaves()
>> >> +{
>> >> +	struct slub_percpu_sheaves *pcs;
>> >> +	struct slub_flush_work *sfw;
>> >> +	struct kmem_cache *s;
>> >> +	bool flushed = false;
>> >> +	unsigned int cpu;
>> >> +
>> >> +	cpus_read_lock();
>> >> +	mutex_lock(&slab_mutex);
>> >> +
>> >> +	list_for_each_entry(s, &slab_caches, list) {
>> >> +		if (!s->cpu_sheaves)
>> >> +			continue;
>> >> +
>> >> +		mutex_lock(&flush_lock);
>> >> +
>> >> +		for_each_online_cpu(cpu) {
>> >> +			sfw = &per_cpu(slub_flush, cpu);
>> >> +			pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
>> >> +
>> >> +			if (!pcs->rcu_free || !pcs->rcu_free->size) {
>> > 
>> > Is the compiler allowed to compile this to read pcs->rcu_free twice?
>> > Something like:
>> > 
>> > flush_all_rcu_sheaves()			__kfree_rcu_sheaf()
>> > 
>> > pcs->rcu_free != NULL
>> > 					pcs->rcu_free = NULL
>> > pcs->rcu_free == NULL
>> > /* NULL-pointer-deref */
>> > pcs->rcu_free->size
>> 
>> Good point, I'll remove the size check and simply pcs->rcu_free non-null
>> means we flush.
>> 
>> >> +				sfw->skip = true;
>> >> +				continue;
>> >> +			}
>> >>
>> >> +			INIT_WORK(&sfw->work, flush_rcu_sheaf);
>> >> +			sfw->skip = false;
>> >> +			sfw->s = s;
>> >> +			queue_work_on(cpu, flushwq, &sfw->work);
>> >> +			flushed = true;
>> >> +		}
>> >> +
>> >> +		for_each_online_cpu(cpu) {
>> >> +			sfw = &per_cpu(slub_flush, cpu);
>> >> +			if (sfw->skip)
>> >> +				continue;
>> >> +			flush_work(&sfw->work);
>> >> +		}
>> >> +
>> >> +		mutex_unlock(&flush_lock);
>> >> +	}
>> >> +
>> >> +	mutex_unlock(&slab_mutex);
>> >> +	cpus_read_unlock();
>> >> +
>> >> +	if (flushed)
>> >> +		rcu_barrier();
>> > 
>> > I think we need to call rcu_barrier() even if flushed == false?
>> > 
>> > Maybe a kvfree_rcu()'d object was already waiting for the rcu callback to
>> > be processed before flush_all_rcu_sheaves() is called, and
>> > in flush_all_rcu_sheaves() we skipped all (cache, cpu) pairs,
>> > so flushed == false but the rcu callback isn't processed yet
>> > by the end of the function?
>> > 
>> > That sounds like a very unlikely to happen in a realistic scenario,
>> > but still possible...
>> 
>> Yes also good point, will flush unconditionally.
>> 
>> Maybe in __kfree_rcu_sheaf() I should also move the call_rcu(...) before
>> local_unlock(). So we don't end up seeing a NULL pcs->rcu_free in
>> flush_all_rcu_sheaves() because __kfree_rcu_sheaf() already set it to NULL,
>> but didn't yet do the call_rcu() as it got preempted after local_unlock().
>> 
>> But then rcu_barrier() itself probably won't mean we make sure such cpus
>> finished the local_locked section, if we didn't queue work on them. So maybe
>> we need synchronize_rcu()?
> 
> Do you need both rcu_barrier() and synchronize_rcu(), maybe along with

We need the local_lock protected sections of __kfree_rcu_sheaf() to be
finished (which might be doing call_rcu(rcu_sheaf) And then the pending
call_rcu(rcu_sheaf) callbacks to be executed.
I think that means both synchronize_rcu() and rcu_barrier(). Possibly a RCU
critical section in __kfree_rcu_sheaf() unless local_lock implies that even
on RT.

> kvfree_rcu_barrier() as well?

This (flush_all_rcu_sheaves()) is for the implementation of
kvfree_rcu_barrier() in a world with rcu_free sheaves. So it's called from
kvfree_rcu_barrier().

> It would not be hard to make such a thing,
> using workqueues or some such.  Not sure what the API should look like,

So there should be no one else calling such an API. There might be new users
of kvfree_rcu_barrier() doing this indirectly in the future.

> especially should people want other RCU flavors to get into the act
> as well.
> 
> 							Thanx, Paul

Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations

Posted by Harry Yoo 2 weeks, 1 day ago

On Wed, Sep 17, 2025 at 11:55:10AM +0200, Vlastimil Babka wrote:
> On 9/17/25 10:30, Harry Yoo wrote:
> > On Wed, Sep 10, 2025 at 10:01:06AM +0200, Vlastimil Babka wrote:
> >> +				sfw->skip = true;
> >> +				continue;
> >> +			}
> >>
> >> +			INIT_WORK(&sfw->work, flush_rcu_sheaf);
> >> +			sfw->skip = false;
> >> +			sfw->s = s;
> >> +			queue_work_on(cpu, flushwq, &sfw->work);
> >> +			flushed = true;
> >> +		}
> >> +
> >> +		for_each_online_cpu(cpu) {
> >> +			sfw = &per_cpu(slub_flush, cpu);
> >> +			if (sfw->skip)
> >> +				continue;
> >> +			flush_work(&sfw->work);
> >> +		}
> >> +
> >> +		mutex_unlock(&flush_lock);
> >> +	}
> >> +
> >> +	mutex_unlock(&slab_mutex);
> >> +	cpus_read_unlock();
> >> +
> >> +	if (flushed)
> >> +		rcu_barrier();
> > 
> > I think we need to call rcu_barrier() even if flushed == false?
> > 
> > Maybe a kvfree_rcu()'d object was already waiting for the rcu callback to
> > be processed before flush_all_rcu_sheaves() is called, and
> > in flush_all_rcu_sheaves() we skipped all (cache, cpu) pairs,
> > so flushed == false but the rcu callback isn't processed yet
> > by the end of the function?
> > 
> > That sounds like a very unlikely to happen in a realistic scenario,
> > but still possible...
> 
> Yes also good point, will flush unconditionally.
> 
> Maybe in __kfree_rcu_sheaf() I should also move the call_rcu(...) before
> local_unlock(). So we don't end up seeing a NULL pcs->rcu_free in
> flush_all_rcu_sheaves() because __kfree_rcu_sheaf() already set it to NULL,
> but didn't yet do the call_rcu() as it got preempted after local_unlock().

Makes sense to me.

> But then rcu_barrier() itself probably won't mean we make sure such cpus
> finished the local_locked section, if we didn't queue work on them. So maybe
> we need synchronize_rcu()?

Ah, it works because preemption disabled section works as a RCU
read-side critical section?

But then are we allowed to do release the local_lock to allocate empty
sheaves in __kfree_rcu_sheaf()?

-- 
Cheers,
Harry / Hyeonggon

Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations

Posted by Vlastimil Babka 2 weeks, 1 day ago

On 9/17/25 13:32, Harry Yoo wrote:
> On Wed, Sep 17, 2025 at 11:55:10AM +0200, Vlastimil Babka wrote:
>> On 9/17/25 10:30, Harry Yoo wrote:
>> > On Wed, Sep 10, 2025 at 10:01:06AM +0200, Vlastimil Babka wrote:
>> >> +				sfw->skip = true;
>> >> +				continue;
>> >> +			}
>> >>
>> >> +			INIT_WORK(&sfw->work, flush_rcu_sheaf);
>> >> +			sfw->skip = false;
>> >> +			sfw->s = s;
>> >> +			queue_work_on(cpu, flushwq, &sfw->work);
>> >> +			flushed = true;
>> >> +		}
>> >> +
>> >> +		for_each_online_cpu(cpu) {
>> >> +			sfw = &per_cpu(slub_flush, cpu);
>> >> +			if (sfw->skip)
>> >> +				continue;
>> >> +			flush_work(&sfw->work);
>> >> +		}
>> >> +
>> >> +		mutex_unlock(&flush_lock);
>> >> +	}
>> >> +
>> >> +	mutex_unlock(&slab_mutex);
>> >> +	cpus_read_unlock();
>> >> +
>> >> +	if (flushed)
>> >> +		rcu_barrier();
>> > 
>> > I think we need to call rcu_barrier() even if flushed == false?
>> > 
>> > Maybe a kvfree_rcu()'d object was already waiting for the rcu callback to
>> > be processed before flush_all_rcu_sheaves() is called, and
>> > in flush_all_rcu_sheaves() we skipped all (cache, cpu) pairs,
>> > so flushed == false but the rcu callback isn't processed yet
>> > by the end of the function?
>> > 
>> > That sounds like a very unlikely to happen in a realistic scenario,
>> > but still possible...
>> 
>> Yes also good point, will flush unconditionally.
>> 
>> Maybe in __kfree_rcu_sheaf() I should also move the call_rcu(...) before
>> local_unlock(). So we don't end up seeing a NULL pcs->rcu_free in
>> flush_all_rcu_sheaves() because __kfree_rcu_sheaf() already set it to NULL,
>> but didn't yet do the call_rcu() as it got preempted after local_unlock().
> 
> Makes sense to me.
> 
>> But then rcu_barrier() itself probably won't mean we make sure such cpus
>> finished the local_locked section, if we didn't queue work on them. So maybe
>> we need synchronize_rcu()?
> 
> Ah, it works because preemption disabled section works as a RCU
> read-side critical section?

AFAIK yes? Or maybe not on RT where local_lock is taking a mutex? So we
should denote the RCU critical section explicitly too?

> But then are we allowed to do release the local_lock to allocate empty
> sheaves in __kfree_rcu_sheaf()?

I think so, as we do that when we found no rcu_sheaf in the first place.

Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations

Posted by Harry Yoo 2 weeks, 1 day ago

On Wed, Sep 17, 2025 at 02:05:49PM +0200, Vlastimil Babka wrote:
> On 9/17/25 13:32, Harry Yoo wrote:
> > On Wed, Sep 17, 2025 at 11:55:10AM +0200, Vlastimil Babka wrote:
> >> On 9/17/25 10:30, Harry Yoo wrote:
> >> > On Wed, Sep 10, 2025 at 10:01:06AM +0200, Vlastimil Babka wrote:
> >> >> +				sfw->skip = true;
> >> >> +				continue;
> >> >> +			}
> >> >>
> >> >> +			INIT_WORK(&sfw->work, flush_rcu_sheaf);
> >> >> +			sfw->skip = false;
> >> >> +			sfw->s = s;
> >> >> +			queue_work_on(cpu, flushwq, &sfw->work);
> >> >> +			flushed = true;
> >> >> +		}
> >> >> +
> >> >> +		for_each_online_cpu(cpu) {
> >> >> +			sfw = &per_cpu(slub_flush, cpu);
> >> >> +			if (sfw->skip)
> >> >> +				continue;
> >> >> +			flush_work(&sfw->work);
> >> >> +		}
> >> >> +
> >> >> +		mutex_unlock(&flush_lock);
> >> >> +	}
> >> >> +
> >> >> +	mutex_unlock(&slab_mutex);
> >> >> +	cpus_read_unlock();
> >> >> +
> >> >> +	if (flushed)
> >> >> +		rcu_barrier();
> >> > 
> >> > I think we need to call rcu_barrier() even if flushed == false?
> >> > 
> >> > Maybe a kvfree_rcu()'d object was already waiting for the rcu callback to
> >> > be processed before flush_all_rcu_sheaves() is called, and
> >> > in flush_all_rcu_sheaves() we skipped all (cache, cpu) pairs,
> >> > so flushed == false but the rcu callback isn't processed yet
> >> > by the end of the function?
> >> > 
> >> > That sounds like a very unlikely to happen in a realistic scenario,
> >> > but still possible...
> >> 
> >> Yes also good point, will flush unconditionally.
> >> 
> >> Maybe in __kfree_rcu_sheaf() I should also move the call_rcu(...) before
> >> local_unlock().
> >>
> >> So we don't end up seeing a NULL pcs->rcu_free in
> >> flush_all_rcu_sheaves() because __kfree_rcu_sheaf() already set it to NULL,
> >> but didn't yet do the call_rcu() as it got preempted after local_unlock().
> > 
> > Makes sense to me.

Wait, I'm confused.

I think the caller of kvfree_rcu_barrier() should make sure that it's invoked
only after a kvfree_rcu(X, rhs) call has returned, if the caller expects
the object X to be freed before kvfree_rcu_barrier() returns?

IOW if flush_all_rcu_sheaves() is called while __kfree_rcu_sheaf(s, X) was
running on another CPU, we don't have to guarantee that
flush_all_rcu_sheaves() returns after the object X is freed?

> >> But then rcu_barrier() itself probably won't mean we make sure such cpus
> >> finished the local_locked section, if we didn't queue work on them. So maybe
> >> we need synchronize_rcu()?

So... we don't need a synchronize_rcu() then?

Or my brain started malfunctioning again :D

-- 
Cheers,
Harry / Hyeonggon

Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations

Posted by Vlastimil Babka 2 weeks, 1 day ago

On 9/17/25 15:07, Harry Yoo wrote:
> On Wed, Sep 17, 2025 at 02:05:49PM +0200, Vlastimil Babka wrote:
>> On 9/17/25 13:32, Harry Yoo wrote:
>> > On Wed, Sep 17, 2025 at 11:55:10AM +0200, Vlastimil Babka wrote:
>> >> On 9/17/25 10:30, Harry Yoo wrote:
>> >> > On Wed, Sep 10, 2025 at 10:01:06AM +0200, Vlastimil Babka wrote:
>> >> >> +				sfw->skip = true;
>> >> >> +				continue;
>> >> >> +			}
>> >> >>
>> >> >> +			INIT_WORK(&sfw->work, flush_rcu_sheaf);
>> >> >> +			sfw->skip = false;
>> >> >> +			sfw->s = s;
>> >> >> +			queue_work_on(cpu, flushwq, &sfw->work);
>> >> >> +			flushed = true;
>> >> >> +		}
>> >> >> +
>> >> >> +		for_each_online_cpu(cpu) {
>> >> >> +			sfw = &per_cpu(slub_flush, cpu);
>> >> >> +			if (sfw->skip)
>> >> >> +				continue;
>> >> >> +			flush_work(&sfw->work);
>> >> >> +		}
>> >> >> +
>> >> >> +		mutex_unlock(&flush_lock);
>> >> >> +	}
>> >> >> +
>> >> >> +	mutex_unlock(&slab_mutex);
>> >> >> +	cpus_read_unlock();
>> >> >> +
>> >> >> +	if (flushed)
>> >> >> +		rcu_barrier();
>> >> > 
>> >> > I think we need to call rcu_barrier() even if flushed == false?
>> >> > 
>> >> > Maybe a kvfree_rcu()'d object was already waiting for the rcu callback to
>> >> > be processed before flush_all_rcu_sheaves() is called, and
>> >> > in flush_all_rcu_sheaves() we skipped all (cache, cpu) pairs,
>> >> > so flushed == false but the rcu callback isn't processed yet
>> >> > by the end of the function?
>> >> > 
>> >> > That sounds like a very unlikely to happen in a realistic scenario,
>> >> > but still possible...
>> >> 
>> >> Yes also good point, will flush unconditionally.
>> >> 
>> >> Maybe in __kfree_rcu_sheaf() I should also move the call_rcu(...) before
>> >> local_unlock().
>> >>
>> >> So we don't end up seeing a NULL pcs->rcu_free in
>> >> flush_all_rcu_sheaves() because __kfree_rcu_sheaf() already set it to NULL,
>> >> but didn't yet do the call_rcu() as it got preempted after local_unlock().
>> > 
>> > Makes sense to me.
> 
> Wait, I'm confused.
> 
> I think the caller of kvfree_rcu_barrier() should make sure that it's invoked
> only after a kvfree_rcu(X, rhs) call has returned, if the caller expects
> the object X to be freed before kvfree_rcu_barrier() returns?

Hmm, the caller of kvfree_rcu(X, rhs) might have returned without filling up
the rcu_sheaf fully and thus without submitting it to call_rcu(), then
migrated to another cpu. Then it calls kvfree_rcu_barrier() while another
unrelated kvfree_rcu(X, rhs) call on the previous cpu is for the same
kmem_cache (kvfree_rcu_barrier() is not only for cache destruction), fills
up the rcu_sheaf fully and is about to call_rcu() on it. And since that
sheaf also contains the object X, we should make sure that is flushed.

> IOW if flush_all_rcu_sheaves() is called while __kfree_rcu_sheaf(s, X) was
> running on another CPU, we don't have to guarantee that
> flush_all_rcu_sheaves() returns after the object X is freed?
> 
>> >> But then rcu_barrier() itself probably won't mean we make sure such cpus
>> >> finished the local_locked section, if we didn't queue work on them. So maybe
>> >> we need synchronize_rcu()?
> 
> So... we don't need a synchronize_rcu() then?
> 
> Or my brain started malfunctioning again :D
>

Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations

Posted by Harry Yoo 2 weeks, 1 day ago

On Wed, Sep 17, 2025 at 03:21:31PM +0200, Vlastimil Babka wrote:
> On 9/17/25 15:07, Harry Yoo wrote:
> > On Wed, Sep 17, 2025 at 02:05:49PM +0200, Vlastimil Babka wrote:
> >> On 9/17/25 13:32, Harry Yoo wrote:
> >> > On Wed, Sep 17, 2025 at 11:55:10AM +0200, Vlastimil Babka wrote:
> >> >> On 9/17/25 10:30, Harry Yoo wrote:
> >> >> > On Wed, Sep 10, 2025 at 10:01:06AM +0200, Vlastimil Babka wrote:
> >> >> >> +				sfw->skip = true;
> >> >> >> +				continue;
> >> >> >> +			}
> >> >> >>
> >> >> >> +			INIT_WORK(&sfw->work, flush_rcu_sheaf);
> >> >> >> +			sfw->skip = false;
> >> >> >> +			sfw->s = s;
> >> >> >> +			queue_work_on(cpu, flushwq, &sfw->work);
> >> >> >> +			flushed = true;
> >> >> >> +		}
> >> >> >> +
> >> >> >> +		for_each_online_cpu(cpu) {
> >> >> >> +			sfw = &per_cpu(slub_flush, cpu);
> >> >> >> +			if (sfw->skip)
> >> >> >> +				continue;
> >> >> >> +			flush_work(&sfw->work);
> >> >> >> +		}
> >> >> >> +
> >> >> >> +		mutex_unlock(&flush_lock);
> >> >> >> +	}
> >> >> >> +
> >> >> >> +	mutex_unlock(&slab_mutex);
> >> >> >> +	cpus_read_unlock();
> >> >> >> +
> >> >> >> +	if (flushed)
> >> >> >> +		rcu_barrier();
> >> >> > 
> >> >> > I think we need to call rcu_barrier() even if flushed == false?
> >> >> > 
> >> >> > Maybe a kvfree_rcu()'d object was already waiting for the rcu callback to
> >> >> > be processed before flush_all_rcu_sheaves() is called, and
> >> >> > in flush_all_rcu_sheaves() we skipped all (cache, cpu) pairs,
> >> >> > so flushed == false but the rcu callback isn't processed yet
> >> >> > by the end of the function?
> >> >> > 
> >> >> > That sounds like a very unlikely to happen in a realistic scenario,
> >> >> > but still possible...
> >> >> 
> >> >> Yes also good point, will flush unconditionally.
> >> >> 
> >> >> Maybe in __kfree_rcu_sheaf() I should also move the call_rcu(...) before
> >> >> local_unlock().
> >> >>
> >> >> So we don't end up seeing a NULL pcs->rcu_free in
> >> >> flush_all_rcu_sheaves() because __kfree_rcu_sheaf() already set it to NULL,
> >> >> but didn't yet do the call_rcu() as it got preempted after local_unlock().
> >> > 
> >> > Makes sense to me.
> > 
> > Wait, I'm confused.
> > 
> > I think the caller of kvfree_rcu_barrier() should make sure that it's invoked
> > only after a kvfree_rcu(X, rhs) call has returned, if the caller expects
> > the object X to be freed before kvfree_rcu_barrier() returns?
> 
> Hmm, the caller of kvfree_rcu(X, rhs) might have returned without filling up
> the rcu_sheaf fully and thus without submitting it to call_rcu(), then
> migrated to another cpu. Then it calls kvfree_rcu_barrier() while another
> unrelated kvfree_rcu(X, rhs) call on the previous cpu is for the same
> kmem_cache (kvfree_rcu_barrier() is not only for cache destruction), fills
> up the rcu_sheaf fully and is about to call_rcu() on it. And since that
> sheaf also contains the object X, we should make sure that is flushed.

I was going to say "but we queue and wait for the flushing work to
complete, so the sheaf containing object X should be flushed?"

But nah, that's true only if we see pcs->rcu_free != NULL in
flush_all_rcu_sheaves().

You are right...

Hmm, maybe it's simpler to fix this by never skipping queueing the work
even when pcs->rcu_sheaf == NULL?

> > IOW if flush_all_rcu_sheaves() is called while __kfree_rcu_sheaf(s, X) was
> > running on another CPU, we don't have to guarantee that
> > flush_all_rcu_sheaves() returns after the object X is freed?
> > 
> >> >> But then rcu_barrier() itself probably won't mean we make sure such cpus
> >> >> finished the local_locked section, if we didn't queue work on them. So maybe
> >> >> we need synchronize_rcu()?
> > 
> > So... we don't need a synchronize_rcu() then?
> > 
> > Or my brain started malfunctioning again :D

-- 
Cheers,
Harry / Hyeonggon

Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations

Posted by Vlastimil Babka 2 weeks, 1 day ago

On 9/17/25 15:34, Harry Yoo wrote:
> On Wed, Sep 17, 2025 at 03:21:31PM +0200, Vlastimil Babka wrote:
>> On 9/17/25 15:07, Harry Yoo wrote:
>> > On Wed, Sep 17, 2025 at 02:05:49PM +0200, Vlastimil Babka wrote:
>> >> On 9/17/25 13:32, Harry Yoo wrote:
>> >> > On Wed, Sep 17, 2025 at 11:55:10AM +0200, Vlastimil Babka wrote:
>> >> >> On 9/17/25 10:30, Harry Yoo wrote:
>> >> >> > On Wed, Sep 10, 2025 at 10:01:06AM +0200, Vlastimil Babka wrote:
>> >> >> >> +				sfw->skip = true;
>> >> >> >> +				continue;
>> >> >> >> +			}
>> >> >> >>
>> >> >> >> +			INIT_WORK(&sfw->work, flush_rcu_sheaf);
>> >> >> >> +			sfw->skip = false;
>> >> >> >> +			sfw->s = s;
>> >> >> >> +			queue_work_on(cpu, flushwq, &sfw->work);
>> >> >> >> +			flushed = true;
>> >> >> >> +		}
>> >> >> >> +
>> >> >> >> +		for_each_online_cpu(cpu) {
>> >> >> >> +			sfw = &per_cpu(slub_flush, cpu);
>> >> >> >> +			if (sfw->skip)
>> >> >> >> +				continue;
>> >> >> >> +			flush_work(&sfw->work);
>> >> >> >> +		}
>> >> >> >> +
>> >> >> >> +		mutex_unlock(&flush_lock);
>> >> >> >> +	}
>> >> >> >> +
>> >> >> >> +	mutex_unlock(&slab_mutex);
>> >> >> >> +	cpus_read_unlock();
>> >> >> >> +
>> >> >> >> +	if (flushed)
>> >> >> >> +		rcu_barrier();
>> >> >> > 
>> >> >> > I think we need to call rcu_barrier() even if flushed == false?
>> >> >> > 
>> >> >> > Maybe a kvfree_rcu()'d object was already waiting for the rcu callback to
>> >> >> > be processed before flush_all_rcu_sheaves() is called, and
>> >> >> > in flush_all_rcu_sheaves() we skipped all (cache, cpu) pairs,
>> >> >> > so flushed == false but the rcu callback isn't processed yet
>> >> >> > by the end of the function?
>> >> >> > 
>> >> >> > That sounds like a very unlikely to happen in a realistic scenario,
>> >> >> > but still possible...
>> >> >> 
>> >> >> Yes also good point, will flush unconditionally.
>> >> >> 
>> >> >> Maybe in __kfree_rcu_sheaf() I should also move the call_rcu(...) before
>> >> >> local_unlock().
>> >> >>
>> >> >> So we don't end up seeing a NULL pcs->rcu_free in
>> >> >> flush_all_rcu_sheaves() because __kfree_rcu_sheaf() already set it to NULL,
>> >> >> but didn't yet do the call_rcu() as it got preempted after local_unlock().
>> >> > 
>> >> > Makes sense to me.
>> > 
>> > Wait, I'm confused.
>> > 
>> > I think the caller of kvfree_rcu_barrier() should make sure that it's invoked
>> > only after a kvfree_rcu(X, rhs) call has returned, if the caller expects
>> > the object X to be freed before kvfree_rcu_barrier() returns?
>> 
>> Hmm, the caller of kvfree_rcu(X, rhs) might have returned without filling up
>> the rcu_sheaf fully and thus without submitting it to call_rcu(), then
>> migrated to another cpu. Then it calls kvfree_rcu_barrier() while another
>> unrelated kvfree_rcu(X, rhs) call on the previous cpu is for the same
>> kmem_cache (kvfree_rcu_barrier() is not only for cache destruction), fills
>> up the rcu_sheaf fully and is about to call_rcu() on it. And since that
>> sheaf also contains the object X, we should make sure that is flushed.
> 
> I was going to say "but we queue and wait for the flushing work to
> complete, so the sheaf containing object X should be flushed?"
> 
> But nah, that's true only if we see pcs->rcu_free != NULL in
> flush_all_rcu_sheaves().
> 
> You are right...
> 
> Hmm, maybe it's simpler to fix this by never skipping queueing the work
> even when pcs->rcu_sheaf == NULL?

I guess it's simpler, yeah.
We might have to think of something better once all caches have sheaves,
queueing and waiting for work to finish on each cpu, repeated for each
kmem_cache, might be just too much?

>> > IOW if flush_all_rcu_sheaves() is called while __kfree_rcu_sheaf(s, X) was
>> > running on another CPU, we don't have to guarantee that
>> > flush_all_rcu_sheaves() returns after the object X is freed?
>> > 
>> >> >> But then rcu_barrier() itself probably won't mean we make sure such cpus
>> >> >> finished the local_locked section, if we didn't queue work on them. So maybe
>> >> >> we need synchronize_rcu()?
>> > 
>> > So... we don't need a synchronize_rcu() then?
>> > 
>> > Or my brain started malfunctioning again :D
>

Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations

Posted by Vlastimil Babka 2 weeks ago

On 9/17/25 16:14, Vlastimil Babka wrote:
> On 9/17/25 15:34, Harry Yoo wrote:
>> On Wed, Sep 17, 2025 at 03:21:31PM +0200, Vlastimil Babka wrote:
>>> On 9/17/25 15:07, Harry Yoo wrote:
>>> > On Wed, Sep 17, 2025 at 02:05:49PM +0200, Vlastimil Babka wrote:
>>> >> On 9/17/25 13:32, Harry Yoo wrote:
>>> >> > On Wed, Sep 17, 2025 at 11:55:10AM +0200, Vlastimil Babka wrote:
>>> >> >> On 9/17/25 10:30, Harry Yoo wrote:
>>> >> >> > On Wed, Sep 10, 2025 at 10:01:06AM +0200, Vlastimil Babka wrote:
>>> >> >> >> +				sfw->skip = true;
>>> >> >> >> +				continue;
>>> >> >> >> +			}
>>> >> >> >>
>>> >> >> >> +			INIT_WORK(&sfw->work, flush_rcu_sheaf);
>>> >> >> >> +			sfw->skip = false;
>>> >> >> >> +			sfw->s = s;
>>> >> >> >> +			queue_work_on(cpu, flushwq, &sfw->work);
>>> >> >> >> +			flushed = true;
>>> >> >> >> +		}
>>> >> >> >> +
>>> >> >> >> +		for_each_online_cpu(cpu) {
>>> >> >> >> +			sfw = &per_cpu(slub_flush, cpu);
>>> >> >> >> +			if (sfw->skip)
>>> >> >> >> +				continue;
>>> >> >> >> +			flush_work(&sfw->work);
>>> >> >> >> +		}
>>> >> >> >> +
>>> >> >> >> +		mutex_unlock(&flush_lock);
>>> >> >> >> +	}
>>> >> >> >> +
>>> >> >> >> +	mutex_unlock(&slab_mutex);
>>> >> >> >> +	cpus_read_unlock();
>>> >> >> >> +
>>> >> >> >> +	if (flushed)
>>> >> >> >> +		rcu_barrier();
>>> >> >> > 
>>> >> >> > I think we need to call rcu_barrier() even if flushed == false?
>>> >> >> > 
>>> >> >> > Maybe a kvfree_rcu()'d object was already waiting for the rcu callback to
>>> >> >> > be processed before flush_all_rcu_sheaves() is called, and
>>> >> >> > in flush_all_rcu_sheaves() we skipped all (cache, cpu) pairs,
>>> >> >> > so flushed == false but the rcu callback isn't processed yet
>>> >> >> > by the end of the function?
>>> >> >> > 
>>> >> >> > That sounds like a very unlikely to happen in a realistic scenario,
>>> >> >> > but still possible...
>>> >> >> 
>>> >> >> Yes also good point, will flush unconditionally.
>>> >> >> 
>>> >> >> Maybe in __kfree_rcu_sheaf() I should also move the call_rcu(...) before
>>> >> >> local_unlock().
>>> >> >>
>>> >> >> So we don't end up seeing a NULL pcs->rcu_free in
>>> >> >> flush_all_rcu_sheaves() because __kfree_rcu_sheaf() already set it to NULL,
>>> >> >> but didn't yet do the call_rcu() as it got preempted after local_unlock().
>>> >> > 
>>> >> > Makes sense to me.
>>> > 
>>> > Wait, I'm confused.
>>> > 
>>> > I think the caller of kvfree_rcu_barrier() should make sure that it's invoked
>>> > only after a kvfree_rcu(X, rhs) call has returned, if the caller expects
>>> > the object X to be freed before kvfree_rcu_barrier() returns?
>>> 
>>> Hmm, the caller of kvfree_rcu(X, rhs) might have returned without filling up
>>> the rcu_sheaf fully and thus without submitting it to call_rcu(), then
>>> migrated to another cpu. Then it calls kvfree_rcu_barrier() while another
>>> unrelated kvfree_rcu(X, rhs) call on the previous cpu is for the same
>>> kmem_cache (kvfree_rcu_barrier() is not only for cache destruction), fills
>>> up the rcu_sheaf fully and is about to call_rcu() on it. And since that
>>> sheaf also contains the object X, we should make sure that is flushed.
>> 
>> I was going to say "but we queue and wait for the flushing work to
>> complete, so the sheaf containing object X should be flushed?"
>> 
>> But nah, that's true only if we see pcs->rcu_free != NULL in
>> flush_all_rcu_sheaves().
>> 
>> You are right...
>> 
>> Hmm, maybe it's simpler to fix this by never skipping queueing the work
>> even when pcs->rcu_sheaf == NULL?
> 
> I guess it's simpler, yeah.

So what about this? The unconditional queueing should cover all races with
__kfree_rcu_sheaf() so there's just unconditional rcu_barrier() in the end.

From 0722b29fa1625b31c05d659d1d988ec882247b38 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Wed, 3 Sep 2025 14:59:46 +0200
Subject: [PATCH] slab: add sheaf support for batching kfree_rcu() operations

Extend the sheaf infrastructure for more efficient kfree_rcu() handling.
For caches with sheaves, on each cpu maintain a rcu_free sheaf in
addition to main and spare sheaves.

kfree_rcu() operations will try to put objects on this sheaf. Once full,
the sheaf is detached and submitted to call_rcu() with a handler that
will try to put it in the barn, or flush to slab pages using bulk free,
when the barn is full. Then a new empty sheaf must be obtained to put
more objects there.

It's possible that no free sheaves are available to use for a new
rcu_free sheaf, and the allocation in kfree_rcu() context can only use
GFP_NOWAIT and thus may fail. In that case, fall back to the existing
kfree_rcu() implementation.

Expected advantages:
- batching the kfree_rcu() operations, that could eventually replace the
  existing batching
- sheaves can be reused for allocations via barn instead of being
  flushed to slabs, which is more efficient
  - this includes cases where only some cpus are allowed to process rcu
    callbacks (Android)

Possible disadvantage:
- objects might be waiting for more than their grace period (it is
  determined by the last object freed into the sheaf), increasing memory
  usage - but the existing batching does that too.

Only implement this for CONFIG_KVFREE_RCU_BATCHED as the tiny
implementation favors smaller memory footprint over performance.

Also for now skip the usage of rcu sheaf for CONFIG_PREEMPT_RT as the
contexts where kfree_rcu() is called might not be compatible with taking
a barn spinlock or a GFP_NOWAIT allocation of a new sheaf taking a
spinlock - the current kfree_rcu() implementation avoids doing that.

Teach kvfree_rcu_barrier() to flush all rcu_free sheaves from all caches
that have them. This is not a cheap operation, but the barrier usage is
rare - currently kmem_cache_destroy() or on module unload.

Add CONFIG_SLUB_STATS counters free_rcu_sheaf and free_rcu_sheaf_fail to
count how many kfree_rcu() used the rcu_free sheaf successfully and how
many had to fall back to the existing implementation.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 mm/slab.h        |   3 +
 mm/slab_common.c |  26 +++++
 mm/slub.c        | 267 ++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 294 insertions(+), 2 deletions(-)

diff --git a/mm/slab.h b/mm/slab.h
index 206987ce44a4..e82e51c44bd0 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -435,6 +435,9 @@ static inline bool is_kmalloc_normal(struct kmem_cache *s)
 	return !(s->flags & (SLAB_CACHE_DMA|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT));
 }
 
+bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj);
+void flush_all_rcu_sheaves(void);
+
 #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
 			 SLAB_CACHE_DMA32 | SLAB_PANIC | \
 			 SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS | \
diff --git a/mm/slab_common.c b/mm/slab_common.c
index e2b197e47866..005a4319c06a 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1608,6 +1608,27 @@ static void kfree_rcu_work(struct work_struct *work)
 		kvfree_rcu_list(head);
 }
 
+static bool kfree_rcu_sheaf(void *obj)
+{
+	struct kmem_cache *s;
+	struct folio *folio;
+	struct slab *slab;
+
+	if (is_vmalloc_addr(obj))
+		return false;
+
+	folio = virt_to_folio(obj);
+	if (unlikely(!folio_test_slab(folio)))
+		return false;
+
+	slab = folio_slab(folio);
+	s = slab->slab_cache;
+	if (s->cpu_sheaves)
+		return __kfree_rcu_sheaf(s, obj);
+
+	return false;
+}
+
 static bool
 need_offload_krc(struct kfree_rcu_cpu *krcp)
 {
@@ -1952,6 +1973,9 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr)
 	if (!head)
 		might_sleep();
 
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT) && kfree_rcu_sheaf(ptr))
+		return;
+
 	// Queue the object but don't yet schedule the batch.
 	if (debug_rcu_head_queue(ptr)) {
 		// Probable double kfree_rcu(), just leak.
@@ -2026,6 +2050,8 @@ void kvfree_rcu_barrier(void)
 	bool queued;
 	int i, cpu;
 
+	flush_all_rcu_sheaves();
+
 	/*
 	 * Firstly we detach objects and queue them over an RCU-batch
 	 * for all CPUs. Finally queued works are flushed for each CPU.
diff --git a/mm/slub.c b/mm/slub.c
index cba188b7e04d..171273f90efd 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -367,6 +367,8 @@ enum stat_item {
 	ALLOC_FASTPATH,		/* Allocation from cpu slab */
 	ALLOC_SLOWPATH,		/* Allocation by getting a new cpu slab */
 	FREE_PCS,		/* Free to percpu sheaf */
+	FREE_RCU_SHEAF,		/* Free to rcu_free sheaf */
+	FREE_RCU_SHEAF_FAIL,	/* Failed to free to a rcu_free sheaf */
 	FREE_FASTPATH,		/* Free to cpu slab */
 	FREE_SLOWPATH,		/* Freeing not to cpu slab */
 	FREE_FROZEN,		/* Freeing to frozen slab */
@@ -461,6 +463,7 @@ struct slab_sheaf {
 		struct rcu_head rcu_head;
 		struct list_head barn_list;
 	};
+	struct kmem_cache *cache;
 	unsigned int size;
 	void *objects[];
 };
@@ -469,6 +472,7 @@ struct slub_percpu_sheaves {
 	local_trylock_t lock;
 	struct slab_sheaf *main; /* never NULL when unlocked */
 	struct slab_sheaf *spare; /* empty or full, may be NULL */
+	struct slab_sheaf *rcu_free; /* for batching kfree_rcu() */
 };
 
 /*
@@ -2531,6 +2535,8 @@ static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp)
 	if (unlikely(!sheaf))
 		return NULL;
 
+	sheaf->cache = s;
+
 	stat(s, SHEAF_ALLOC);
 
 	return sheaf;
@@ -2655,6 +2661,43 @@ static void sheaf_flush_unused(struct kmem_cache *s, struct slab_sheaf *sheaf)
 	sheaf->size = 0;
 }
 
+static void __rcu_free_sheaf_prepare(struct kmem_cache *s,
+				     struct slab_sheaf *sheaf)
+{
+	bool init = slab_want_init_on_free(s);
+	void **p = &sheaf->objects[0];
+	unsigned int i = 0;
+
+	while (i < sheaf->size) {
+		struct slab *slab = virt_to_slab(p[i]);
+
+		memcg_slab_free_hook(s, slab, p + i, 1);
+		alloc_tagging_slab_free_hook(s, slab, p + i, 1);
+
+		if (unlikely(!slab_free_hook(s, p[i], init, true))) {
+			p[i] = p[--sheaf->size];
+			continue;
+		}
+
+		i++;
+	}
+}
+
+static void rcu_free_sheaf_nobarn(struct rcu_head *head)
+{
+	struct slab_sheaf *sheaf;
+	struct kmem_cache *s;
+
+	sheaf = container_of(head, struct slab_sheaf, rcu_head);
+	s = sheaf->cache;
+
+	__rcu_free_sheaf_prepare(s, sheaf);
+
+	sheaf_flush_unused(s, sheaf);
+
+	free_empty_sheaf(s, sheaf);
+}
+
 /*
  * Caller needs to make sure migration is disabled in order to fully flush
  * single cpu's sheaves
@@ -2667,7 +2710,7 @@ static void sheaf_flush_unused(struct kmem_cache *s, struct slab_sheaf *sheaf)
 static void pcs_flush_all(struct kmem_cache *s)
 {
 	struct slub_percpu_sheaves *pcs;
-	struct slab_sheaf *spare;
+	struct slab_sheaf *spare, *rcu_free;
 
 	local_lock(&s->cpu_sheaves->lock);
 	pcs = this_cpu_ptr(s->cpu_sheaves);
@@ -2675,6 +2718,9 @@ static void pcs_flush_all(struct kmem_cache *s)
 	spare = pcs->spare;
 	pcs->spare = NULL;
 
+	rcu_free = pcs->rcu_free;
+	pcs->rcu_free = NULL;
+
 	local_unlock(&s->cpu_sheaves->lock);
 
 	if (spare) {
@@ -2682,6 +2728,9 @@ static void pcs_flush_all(struct kmem_cache *s)
 		free_empty_sheaf(s, spare);
 	}
 
+	if (rcu_free)
+		call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn);
+
 	sheaf_flush_main(s);
 }
 
@@ -2698,6 +2747,11 @@ static void __pcs_flush_all_cpu(struct kmem_cache *s, unsigned int cpu)
 		free_empty_sheaf(s, pcs->spare);
 		pcs->spare = NULL;
 	}
+
+	if (pcs->rcu_free) {
+		call_rcu(&pcs->rcu_free->rcu_head, rcu_free_sheaf_nobarn);
+		pcs->rcu_free = NULL;
+	}
 }
 
 static void pcs_destroy(struct kmem_cache *s)
@@ -2723,6 +2777,7 @@ static void pcs_destroy(struct kmem_cache *s)
 		 */
 
 		WARN_ON(pcs->spare);
+		WARN_ON(pcs->rcu_free);
 
 		if (!WARN_ON(pcs->main->size)) {
 			free_empty_sheaf(s, pcs->main);
@@ -3780,7 +3835,7 @@ static bool has_pcs_used(int cpu, struct kmem_cache *s)
 
 	pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
 
-	return (pcs->spare || pcs->main->size);
+	return (pcs->spare || pcs->rcu_free || pcs->main->size);
 }
 
 /*
@@ -3840,6 +3895,77 @@ static void flush_all(struct kmem_cache *s)
 	cpus_read_unlock();
 }
 
+static void flush_rcu_sheaf(struct work_struct *w)
+{
+	struct slub_percpu_sheaves *pcs;
+	struct slab_sheaf *rcu_free;
+	struct slub_flush_work *sfw;
+	struct kmem_cache *s;
+
+	sfw = container_of(w, struct slub_flush_work, work);
+	s = sfw->s;
+
+	local_lock(&s->cpu_sheaves->lock);
+	pcs = this_cpu_ptr(s->cpu_sheaves);
+
+	rcu_free = pcs->rcu_free;
+	pcs->rcu_free = NULL;
+
+	local_unlock(&s->cpu_sheaves->lock);
+
+	if (rcu_free)
+		call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn);
+}
+
+
+/* needed for kvfree_rcu_barrier() */
+void flush_all_rcu_sheaves(void)
+{
+	struct slub_flush_work *sfw;
+	struct kmem_cache *s;
+	unsigned int cpu;
+
+	cpus_read_lock();
+	mutex_lock(&slab_mutex);
+
+	list_for_each_entry(s, &slab_caches, list) {
+		if (!s->cpu_sheaves)
+			continue;
+
+		mutex_lock(&flush_lock);
+
+		for_each_online_cpu(cpu) {
+			sfw = &per_cpu(slub_flush, cpu);
+
+			/*
+			 * we don't check if rcu_free sheaf exists - racing
+			 * __kfree_rcu_sheaf() might have just removed it.
+			 * by executing flush_rcu_sheaf() on the cpu we make
+			 * sure the __kfree_rcu_sheaf() finished its call_rcu()
+			 */
+
+			INIT_WORK(&sfw->work, flush_rcu_sheaf);
+			sfw->skip = false;
+			sfw->s = s;
+			queue_work_on(cpu, flushwq, &sfw->work);
+		}
+
+		for_each_online_cpu(cpu) {
+			sfw = &per_cpu(slub_flush, cpu);
+			if (sfw->skip)
+				continue;
+			flush_work(&sfw->work);
+		}
+
+		mutex_unlock(&flush_lock);
+	}
+
+	mutex_unlock(&slab_mutex);
+	cpus_read_unlock();
+
+	rcu_barrier();
+}
+
 /*
  * Use the cpu notifier to insure that the cpu slabs are flushed when
  * necessary.
@@ -5413,6 +5539,134 @@ bool free_to_pcs(struct kmem_cache *s, void *object)
 	return true;
 }
 
+static void rcu_free_sheaf(struct rcu_head *head)
+{
+	struct slab_sheaf *sheaf;
+	struct node_barn *barn;
+	struct kmem_cache *s;
+
+	sheaf = container_of(head, struct slab_sheaf, rcu_head);
+
+	s = sheaf->cache;
+
+	/*
+	 * This may remove some objects due to slab_free_hook() returning false,
+	 * so that the sheaf might no longer be completely full. But it's easier
+	 * to handle it as full (unless it became completely empty), as the code
+	 * handles it fine. The only downside is that sheaf will serve fewer
+	 * allocations when reused. It only happens due to debugging, which is a
+	 * performance hit anyway.
+	 */
+	__rcu_free_sheaf_prepare(s, sheaf);
+
+	barn = get_node(s, numa_mem_id())->barn;
+
+	/* due to slab_free_hook() */
+	if (unlikely(sheaf->size == 0))
+		goto empty;
+
+	/*
+	 * Checking nr_full/nr_empty outside lock avoids contention in case the
+	 * barn is at the respective limit. Due to the race we might go over the
+	 * limit but that should be rare and harmless.
+	 */
+
+	if (data_race(barn->nr_full) < MAX_FULL_SHEAVES) {
+		stat(s, BARN_PUT);
+		barn_put_full_sheaf(barn, sheaf);
+		return;
+	}
+
+	stat(s, BARN_PUT_FAIL);
+	sheaf_flush_unused(s, sheaf);
+
+empty:
+	if (data_race(barn->nr_empty) < MAX_EMPTY_SHEAVES) {
+		barn_put_empty_sheaf(barn, sheaf);
+		return;
+	}
+
+	free_empty_sheaf(s, sheaf);
+}
+
+bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj)
+{
+	struct slub_percpu_sheaves *pcs;
+	struct slab_sheaf *rcu_sheaf;
+
+	if (!local_trylock(&s->cpu_sheaves->lock))
+		goto fail;
+
+	pcs = this_cpu_ptr(s->cpu_sheaves);
+
+	if (unlikely(!pcs->rcu_free)) {
+
+		struct slab_sheaf *empty;
+		struct node_barn *barn;
+
+		if (pcs->spare && pcs->spare->size == 0) {
+			pcs->rcu_free = pcs->spare;
+			pcs->spare = NULL;
+			goto do_free;
+		}
+
+		barn = get_barn(s);
+
+		empty = barn_get_empty_sheaf(barn);
+
+		if (empty) {
+			pcs->rcu_free = empty;
+			goto do_free;
+		}
+
+		local_unlock(&s->cpu_sheaves->lock);
+
+		empty = alloc_empty_sheaf(s, GFP_NOWAIT);
+
+		if (!empty)
+			goto fail;
+
+		if (!local_trylock(&s->cpu_sheaves->lock)) {
+			barn_put_empty_sheaf(barn, empty);
+			goto fail;
+		}
+
+		pcs = this_cpu_ptr(s->cpu_sheaves);
+
+		if (unlikely(pcs->rcu_free))
+			barn_put_empty_sheaf(barn, empty);
+		else
+			pcs->rcu_free = empty;
+	}
+
+do_free:
+
+	rcu_sheaf = pcs->rcu_free;
+
+	rcu_sheaf->objects[rcu_sheaf->size++] = obj;
+
+	if (likely(rcu_sheaf->size < s->sheaf_capacity))
+		rcu_sheaf = NULL;
+	else
+		pcs->rcu_free = NULL;
+
+	/*
+	 * we flush before local_unlock to make sure a racing
+	 * flush_all_rcu_sheaves() doesn't miss this sheaf
+	 */
+	if (rcu_sheaf)
+		call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf);
+
+	local_unlock(&s->cpu_sheaves->lock);
+
+	stat(s, FREE_RCU_SHEAF);
+	return true;
+
+fail:
+	stat(s, FREE_RCU_SHEAF_FAIL);
+	return false;
+}
+
 /*
  * Bulk free objects to the percpu sheaves.
  * Unlike free_to_pcs() this includes the calls to all necessary hooks
@@ -6909,6 +7163,11 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
 	struct kmem_cache_node *n;
 
 	flush_all_cpus_locked(s);
+
+	/* we might have rcu sheaves in flight */
+	if (s->cpu_sheaves)
+		rcu_barrier();
+
 	/* Attempt to free all objects */
 	for_each_kmem_cache_node(s, node, n) {
 		if (n->barn)
@@ -8284,6 +8543,8 @@ STAT_ATTR(ALLOC_PCS, alloc_cpu_sheaf);
 STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
 STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
 STAT_ATTR(FREE_PCS, free_cpu_sheaf);
+STAT_ATTR(FREE_RCU_SHEAF, free_rcu_sheaf);
+STAT_ATTR(FREE_RCU_SHEAF_FAIL, free_rcu_sheaf_fail);
 STAT_ATTR(FREE_FASTPATH, free_fastpath);
 STAT_ATTR(FREE_SLOWPATH, free_slowpath);
 STAT_ATTR(FREE_FROZEN, free_frozen);
@@ -8382,6 +8643,8 @@ static struct attribute *slab_attrs[] = {
 	&alloc_fastpath_attr.attr,
 	&alloc_slowpath_attr.attr,
 	&free_cpu_sheaf_attr.attr,
+	&free_rcu_sheaf_attr.attr,
+	&free_rcu_sheaf_fail_attr.attr,
 	&free_fastpath_attr.attr,
 	&free_slowpath_attr.attr,
 	&free_frozen_attr.attr,
-- 
2.51.0

Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations

Posted by Harry Yoo 1 week, 6 days ago

On Thu, Sep 18, 2025 at 10:09:34AM +0200, Vlastimil Babka wrote:
> On 9/17/25 16:14, Vlastimil Babka wrote:
> > On 9/17/25 15:34, Harry Yoo wrote:
> >> On Wed, Sep 17, 2025 at 03:21:31PM +0200, Vlastimil Babka wrote:
> >>> On 9/17/25 15:07, Harry Yoo wrote:
> >>> > On Wed, Sep 17, 2025 at 02:05:49PM +0200, Vlastimil Babka wrote:
> >>> >> On 9/17/25 13:32, Harry Yoo wrote:
> >>> >> > On Wed, Sep 17, 2025 at 11:55:10AM +0200, Vlastimil Babka wrote:
> >>> >> >> On 9/17/25 10:30, Harry Yoo wrote:
> >>> >> >> > On Wed, Sep 10, 2025 at 10:01:06AM +0200, Vlastimil Babka wrote:
> >>> >> >> >> +				sfw->skip = true;
> >>> >> >> >> +				continue;
> >>> >> >> >> +			}
> >>> >> >> >>
> >>> >> >> >> +			INIT_WORK(&sfw->work, flush_rcu_sheaf);
> >>> >> >> >> +			sfw->skip = false;
> >>> >> >> >> +			sfw->s = s;
> >>> >> >> >> +			queue_work_on(cpu, flushwq, &sfw->work);
> >>> >> >> >> +			flushed = true;
> >>> >> >> >> +		}
> >>> >> >> >> +
> >>> >> >> >> +		for_each_online_cpu(cpu) {
> >>> >> >> >> +			sfw = &per_cpu(slub_flush, cpu);
> >>> >> >> >> +			if (sfw->skip)
> >>> >> >> >> +				continue;
> >>> >> >> >> +			flush_work(&sfw->work);
> >>> >> >> >> +		}
> >>> >> >> >> +
> >>> >> >> >> +		mutex_unlock(&flush_lock);
> >>> >> >> >> +	}
> >>> >> >> >> +
> >>> >> >> >> +	mutex_unlock(&slab_mutex);
> >>> >> >> >> +	cpus_read_unlock();
> >>> >> >> >> +
> >>> >> >> >> +	if (flushed)
> >>> >> >> >> +		rcu_barrier();
> >>> >> >> > 
> >>> >> >> > I think we need to call rcu_barrier() even if flushed == false?
> >>> >> >> > 
> >>> >> >> > Maybe a kvfree_rcu()'d object was already waiting for the rcu callback to
> >>> >> >> > be processed before flush_all_rcu_sheaves() is called, and
> >>> >> >> > in flush_all_rcu_sheaves() we skipped all (cache, cpu) pairs,
> >>> >> >> > so flushed == false but the rcu callback isn't processed yet
> >>> >> >> > by the end of the function?
> >>> >> >> > 
> >>> >> >> > That sounds like a very unlikely to happen in a realistic scenario,
> >>> >> >> > but still possible...
> >>> >> >> 
> >>> >> >> Yes also good point, will flush unconditionally.
> >>> >> >> 
> >>> >> >> Maybe in __kfree_rcu_sheaf() I should also move the call_rcu(...) before
> >>> >> >> local_unlock().
> >>> >> >>
> >>> >> >> So we don't end up seeing a NULL pcs->rcu_free in
> >>> >> >> flush_all_rcu_sheaves() because __kfree_rcu_sheaf() already set it to NULL,
> >>> >> >> but didn't yet do the call_rcu() as it got preempted after local_unlock().
> >>> >> > 
> >>> >> > Makes sense to me.
> >>> > 
> >>> > Wait, I'm confused.
> >>> > 
> >>> > I think the caller of kvfree_rcu_barrier() should make sure that it's invoked
> >>> > only after a kvfree_rcu(X, rhs) call has returned, if the caller expects
> >>> > the object X to be freed before kvfree_rcu_barrier() returns?
> >>> 
> >>> Hmm, the caller of kvfree_rcu(X, rhs) might have returned without filling up
> >>> the rcu_sheaf fully and thus without submitting it to call_rcu(), then
> >>> migrated to another cpu. Then it calls kvfree_rcu_barrier() while another
> >>> unrelated kvfree_rcu(X, rhs) call on the previous cpu is for the same
> >>> kmem_cache (kvfree_rcu_barrier() is not only for cache destruction), fills
> >>> up the rcu_sheaf fully and is about to call_rcu() on it. And since that
> >>> sheaf also contains the object X, we should make sure that is flushed.
> >> 
> >> I was going to say "but we queue and wait for the flushing work to
> >> complete, so the sheaf containing object X should be flushed?"
> >> 
> >> But nah, that's true only if we see pcs->rcu_free != NULL in
> >> flush_all_rcu_sheaves().
> >> 
> >> You are right...
> >> 
> >> Hmm, maybe it's simpler to fix this by never skipping queueing the work
> >> even when pcs->rcu_sheaf == NULL?
> > 
> > I guess it's simpler, yeah.
> 
> So what about this? The unconditional queueing should cover all races with
> __kfree_rcu_sheaf() so there's just unconditional rcu_barrier() in the end.
> 
> From 0722b29fa1625b31c05d659d1d988ec882247b38 Mon Sep 17 00:00:00 2001
> From: Vlastimil Babka <vbabka@suse.cz>
> Date: Wed, 3 Sep 2025 14:59:46 +0200
> Subject: [PATCH] slab: add sheaf support for batching kfree_rcu() operations
> 
> Extend the sheaf infrastructure for more efficient kfree_rcu() handling.
> For caches with sheaves, on each cpu maintain a rcu_free sheaf in
> addition to main and spare sheaves.
> 
> kfree_rcu() operations will try to put objects on this sheaf. Once full,
> the sheaf is detached and submitted to call_rcu() with a handler that
> will try to put it in the barn, or flush to slab pages using bulk free,
> when the barn is full. Then a new empty sheaf must be obtained to put
> more objects there.
> 
> It's possible that no free sheaves are available to use for a new
> rcu_free sheaf, and the allocation in kfree_rcu() context can only use
> GFP_NOWAIT and thus may fail. In that case, fall back to the existing
> kfree_rcu() implementation.
> 
> Expected advantages:
> - batching the kfree_rcu() operations, that could eventually replace the
>   existing batching
> - sheaves can be reused for allocations via barn instead of being
>   flushed to slabs, which is more efficient
>   - this includes cases where only some cpus are allowed to process rcu
>     callbacks (Android)
> 
> Possible disadvantage:
> - objects might be waiting for more than their grace period (it is
>   determined by the last object freed into the sheaf), increasing memory
>   usage - but the existing batching does that too.
> 
> Only implement this for CONFIG_KVFREE_RCU_BATCHED as the tiny
> implementation favors smaller memory footprint over performance.
> 
> Also for now skip the usage of rcu sheaf for CONFIG_PREEMPT_RT as the
> contexts where kfree_rcu() is called might not be compatible with taking
> a barn spinlock or a GFP_NOWAIT allocation of a new sheaf taking a
> spinlock - the current kfree_rcu() implementation avoids doing that.
> 
> Teach kvfree_rcu_barrier() to flush all rcu_free sheaves from all caches
> that have them. This is not a cheap operation, but the barrier usage is
> rare - currently kmem_cache_destroy() or on module unload.
> 
> Add CONFIG_SLUB_STATS counters free_rcu_sheaf and free_rcu_sheaf_fail to
> count how many kfree_rcu() used the rcu_free sheaf successfully and how
> many had to fall back to the existing implementation.
> 
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> ---

Looks good to me,
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>

with a nit:

> +bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj)
> +{
> +	struct slub_percpu_sheaves *pcs;
> +	struct slab_sheaf *rcu_sheaf;
> +
> +	if (!local_trylock(&s->cpu_sheaves->lock))
> +		goto fail;
> +
> +	pcs = this_cpu_ptr(s->cpu_sheaves);
> +
> +	if (unlikely(!pcs->rcu_free)) {
> +
> +		struct slab_sheaf *empty;
> +		struct node_barn *barn;
> +
> +		if (pcs->spare && pcs->spare->size == 0) {
> +			pcs->rcu_free = pcs->spare;
> +			pcs->spare = NULL;
> +			goto do_free;
> +		}
> +
> +		barn = get_barn(s);
> +
> +		empty = barn_get_empty_sheaf(barn);
> +
> +		if (empty) {
> +			pcs->rcu_free = empty;
> +			goto do_free;
> +		}
> +
> +		local_unlock(&s->cpu_sheaves->lock);
> +
> +		empty = alloc_empty_sheaf(s, GFP_NOWAIT);
> +
> +		if (!empty)
> +			goto fail;
> +
> +		if (!local_trylock(&s->cpu_sheaves->lock)) {
> +			barn_put_empty_sheaf(barn, empty);
> +			goto fail;
> +		}
> +
> +		pcs = this_cpu_ptr(s->cpu_sheaves);
> +
> +		if (unlikely(pcs->rcu_free))
> +			barn_put_empty_sheaf(barn, empty);
> +		else
> +			pcs->rcu_free = empty;
> +	}
> +
> +do_free:
> +
> +	rcu_sheaf = pcs->rcu_free;
> +
> +	rcu_sheaf->objects[rcu_sheaf->size++] = obj;
> +
> +	if (likely(rcu_sheaf->size < s->sheaf_capacity))
> +		rcu_sheaf = NULL;
> +	else
> +		pcs->rcu_free = NULL;
> +
> +	/*
> +	 * we flush before local_unlock to make sure a racing
> +	 * flush_all_rcu_sheaves() doesn't miss this sheaf
> +	 */
> +	if (rcu_sheaf)
> +		call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf);

nit: now we don't have to put this inside local_lock()~local_unlock()?

-- 
Cheers,
Harry / Hyeonggon

> +	local_unlock(&s->cpu_sheaves->lock);
> +
> +	stat(s, FREE_RCU_SHEAF);
> +	return true;
> +
> +fail:
> +	stat(s, FREE_RCU_SHEAF_FAIL);
> +	return false;
> +}

Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations

Posted by Vlastimil Babka 1 week, 6 days ago

On 9/19/25 08:47, Harry Yoo wrote:
> On Thu, Sep 18, 2025 at 10:09:34AM +0200, Vlastimil Babka wrote:
>> On 9/17/25 16:14, Vlastimil Babka wrote:
>> > On 9/17/25 15:34, Harry Yoo wrote:
>> >> On Wed, Sep 17, 2025 at 03:21:31PM +0200, Vlastimil Babka wrote:
>> >>> On 9/17/25 15:07, Harry Yoo wrote:
>> >>> > On Wed, Sep 17, 2025 at 02:05:49PM +0200, Vlastimil Babka wrote:
>> >>> >> On 9/17/25 13:32, Harry Yoo wrote:
>> >>> >> > On Wed, Sep 17, 2025 at 11:55:10AM +0200, Vlastimil Babka wrote:
>> >>> >> >> On 9/17/25 10:30, Harry Yoo wrote:
>> >>> >> >> > On Wed, Sep 10, 2025 at 10:01:06AM +0200, Vlastimil Babka wrote:
>> >>> >> >> >> +				sfw->skip = true;
>> >>> >> >> >> +				continue;
>> >>> >> >> >> +			}
>> >>> >> >> >>
>> >>> >> >> >> +			INIT_WORK(&sfw->work, flush_rcu_sheaf);
>> >>> >> >> >> +			sfw->skip = false;
>> >>> >> >> >> +			sfw->s = s;
>> >>> >> >> >> +			queue_work_on(cpu, flushwq, &sfw->work);
>> >>> >> >> >> +			flushed = true;
>> >>> >> >> >> +		}
>> >>> >> >> >> +
>> >>> >> >> >> +		for_each_online_cpu(cpu) {
>> >>> >> >> >> +			sfw = &per_cpu(slub_flush, cpu);
>> >>> >> >> >> +			if (sfw->skip)
>> >>> >> >> >> +				continue;
>> >>> >> >> >> +			flush_work(&sfw->work);
>> >>> >> >> >> +		}
>> >>> >> >> >> +
>> >>> >> >> >> +		mutex_unlock(&flush_lock);
>> >>> >> >> >> +	}
>> >>> >> >> >> +
>> >>> >> >> >> +	mutex_unlock(&slab_mutex);
>> >>> >> >> >> +	cpus_read_unlock();
>> >>> >> >> >> +
>> >>> >> >> >> +	if (flushed)
>> >>> >> >> >> +		rcu_barrier();
>> >>> >> >> > 
>> >>> >> >> > I think we need to call rcu_barrier() even if flushed == false?
>> >>> >> >> > 
>> >>> >> >> > Maybe a kvfree_rcu()'d object was already waiting for the rcu callback to
>> >>> >> >> > be processed before flush_all_rcu_sheaves() is called, and
>> >>> >> >> > in flush_all_rcu_sheaves() we skipped all (cache, cpu) pairs,
>> >>> >> >> > so flushed == false but the rcu callback isn't processed yet
>> >>> >> >> > by the end of the function?
>> >>> >> >> > 
>> >>> >> >> > That sounds like a very unlikely to happen in a realistic scenario,
>> >>> >> >> > but still possible...
>> >>> >> >> 
>> >>> >> >> Yes also good point, will flush unconditionally.
>> >>> >> >> 
>> >>> >> >> Maybe in __kfree_rcu_sheaf() I should also move the call_rcu(...) before
>> >>> >> >> local_unlock().
>> >>> >> >>
>> >>> >> >> So we don't end up seeing a NULL pcs->rcu_free in
>> >>> >> >> flush_all_rcu_sheaves() because __kfree_rcu_sheaf() already set it to NULL,
>> >>> >> >> but didn't yet do the call_rcu() as it got preempted after local_unlock().
>> >>> >> > 
>> >>> >> > Makes sense to me.
>> >>> > 
>> >>> > Wait, I'm confused.
>> >>> > 
>> >>> > I think the caller of kvfree_rcu_barrier() should make sure that it's invoked
>> >>> > only after a kvfree_rcu(X, rhs) call has returned, if the caller expects
>> >>> > the object X to be freed before kvfree_rcu_barrier() returns?
>> >>> 
>> >>> Hmm, the caller of kvfree_rcu(X, rhs) might have returned without filling up
>> >>> the rcu_sheaf fully and thus without submitting it to call_rcu(), then
>> >>> migrated to another cpu. Then it calls kvfree_rcu_barrier() while another
>> >>> unrelated kvfree_rcu(X, rhs) call on the previous cpu is for the same
>> >>> kmem_cache (kvfree_rcu_barrier() is not only for cache destruction), fills
>> >>> up the rcu_sheaf fully and is about to call_rcu() on it. And since that
>> >>> sheaf also contains the object X, we should make sure that is flushed.
>> >> 
>> >> I was going to say "but we queue and wait for the flushing work to
>> >> complete, so the sheaf containing object X should be flushed?"
>> >> 
>> >> But nah, that's true only if we see pcs->rcu_free != NULL in
>> >> flush_all_rcu_sheaves().
>> >> 
>> >> You are right...
>> >> 
>> >> Hmm, maybe it's simpler to fix this by never skipping queueing the work
>> >> even when pcs->rcu_sheaf == NULL?
>> > 
>> > I guess it's simpler, yeah.
>> 
>> So what about this? The unconditional queueing should cover all races with
>> __kfree_rcu_sheaf() so there's just unconditional rcu_barrier() in the end.
>> 
>> From 0722b29fa1625b31c05d659d1d988ec882247b38 Mon Sep 17 00:00:00 2001
>> From: Vlastimil Babka <vbabka@suse.cz>
>> Date: Wed, 3 Sep 2025 14:59:46 +0200
>> Subject: [PATCH] slab: add sheaf support for batching kfree_rcu() operations
>> 
>> Extend the sheaf infrastructure for more efficient kfree_rcu() handling.
>> For caches with sheaves, on each cpu maintain a rcu_free sheaf in
>> addition to main and spare sheaves.
>> 
>> kfree_rcu() operations will try to put objects on this sheaf. Once full,
>> the sheaf is detached and submitted to call_rcu() with a handler that
>> will try to put it in the barn, or flush to slab pages using bulk free,
>> when the barn is full. Then a new empty sheaf must be obtained to put
>> more objects there.
>> 
>> It's possible that no free sheaves are available to use for a new
>> rcu_free sheaf, and the allocation in kfree_rcu() context can only use
>> GFP_NOWAIT and thus may fail. In that case, fall back to the existing
>> kfree_rcu() implementation.
>> 
>> Expected advantages:
>> - batching the kfree_rcu() operations, that could eventually replace the
>>   existing batching
>> - sheaves can be reused for allocations via barn instead of being
>>   flushed to slabs, which is more efficient
>>   - this includes cases where only some cpus are allowed to process rcu
>>     callbacks (Android)
>> 
>> Possible disadvantage:
>> - objects might be waiting for more than their grace period (it is
>>   determined by the last object freed into the sheaf), increasing memory
>>   usage - but the existing batching does that too.
>> 
>> Only implement this for CONFIG_KVFREE_RCU_BATCHED as the tiny
>> implementation favors smaller memory footprint over performance.
>> 
>> Also for now skip the usage of rcu sheaf for CONFIG_PREEMPT_RT as the
>> contexts where kfree_rcu() is called might not be compatible with taking
>> a barn spinlock or a GFP_NOWAIT allocation of a new sheaf taking a
>> spinlock - the current kfree_rcu() implementation avoids doing that.
>> 
>> Teach kvfree_rcu_barrier() to flush all rcu_free sheaves from all caches
>> that have them. This is not a cheap operation, but the barrier usage is
>> rare - currently kmem_cache_destroy() or on module unload.
>> 
>> Add CONFIG_SLUB_STATS counters free_rcu_sheaf and free_rcu_sheaf_fail to
>> count how many kfree_rcu() used the rcu_free sheaf successfully and how
>> many had to fall back to the existing implementation.
>> 
>> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
>> ---
> 
> Looks good to me,
> Reviewed-by: Harry Yoo <harry.yoo@oracle.com>

Thanks.

>> +do_free:
>> +
>> +	rcu_sheaf = pcs->rcu_free;
>> +
>> +	rcu_sheaf->objects[rcu_sheaf->size++] = obj;
>> +
>> +	if (likely(rcu_sheaf->size < s->sheaf_capacity))
>> +		rcu_sheaf = NULL;
>> +	else
>> +		pcs->rcu_free = NULL;
>> +
>> +	/*
>> +	 * we flush before local_unlock to make sure a racing
>> +	 * flush_all_rcu_sheaves() doesn't miss this sheaf
>> +	 */
>> +	if (rcu_sheaf)
>> +		call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf);
> 
> nit: now we don't have to put this inside local_lock()~local_unlock()?

I think we still need to? AFAICS I wrote before is still true:

The caller of kvfree_rcu(X, rhs) might have returned without filling up
the rcu_sheaf fully and thus without submitting it to call_rcu(), then
migrated to another cpu. Then it calls kvfree_rcu_barrier() while another
unrelated kvfree_rcu(X, rhs) call on the previous cpu is for the same
kmem_cache (kvfree_rcu_barrier() is not only for cache destruction), fills
up the rcu_sheaf fully and is about to call_rcu() on it.

If it can local_unlock() before doing the call_rcu(), it can local_unlock(),
get preempted, and our flush worqueue handler will only see there's no
rcu_free sheaf and do nothing.

If if must call_rcu() before local_unlock(), our flush workqueue handler
will not execute on the cpu until it performs the call_rcu() and
local_unlock(), because it can't preempt that section (!RT) or will have to
wait doing local_lock() in flush_rcu_sheaf() (RT) - here it's important it
takes the lock unconditionally.

Or am I missing something?

Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations

Posted by Harry Yoo 1 week, 6 days ago

On Fri, Sep 19, 2025 at 09:02:22AM +0200, Vlastimil Babka wrote:
> On 9/19/25 08:47, Harry Yoo wrote:
> > On Thu, Sep 18, 2025 at 10:09:34AM +0200, Vlastimil Babka wrote:
> >> On 9/17/25 16:14, Vlastimil Babka wrote:
> >> > On 9/17/25 15:34, Harry Yoo wrote:
> >> >> On Wed, Sep 17, 2025 at 03:21:31PM +0200, Vlastimil Babka wrote:
> >> >>> On 9/17/25 15:07, Harry Yoo wrote:
> >> >>> > On Wed, Sep 17, 2025 at 02:05:49PM +0200, Vlastimil Babka wrote:
> >> >>> >> On 9/17/25 13:32, Harry Yoo wrote:
> >> >>> >> > On Wed, Sep 17, 2025 at 11:55:10AM +0200, Vlastimil Babka wrote:
> >> >>> >> >> On 9/17/25 10:30, Harry Yoo wrote:
> >> >>> >> >> > On Wed, Sep 10, 2025 at 10:01:06AM +0200, Vlastimil Babka wrote:
> >> >>> >> >> >> +				sfw->skip = true;
> >> >>> >> >> >> +				continue;
> >> >>> >> >> >> +			}
> >> >>> >> >> >>
> >> >>> >> >> >> +			INIT_WORK(&sfw->work, flush_rcu_sheaf);
> >> >>> >> >> >> +			sfw->skip = false;
> >> >>> >> >> >> +			sfw->s = s;
> >> >>> >> >> >> +			queue_work_on(cpu, flushwq, &sfw->work);
> >> >>> >> >> >> +			flushed = true;
> >> >>> >> >> >> +		}
> >> >>> >> >> >> +
> >> >>> >> >> >> +		for_each_online_cpu(cpu) {
> >> >>> >> >> >> +			sfw = &per_cpu(slub_flush, cpu);
> >> >>> >> >> >> +			if (sfw->skip)
> >> >>> >> >> >> +				continue;
> >> >>> >> >> >> +			flush_work(&sfw->work);
> >> >>> >> >> >> +		}
> >> >>> >> >> >> +
> >> >>> >> >> >> +		mutex_unlock(&flush_lock);
> >> >>> >> >> >> +	}
> >> >>> >> >> >> +
> >> >>> >> >> >> +	mutex_unlock(&slab_mutex);
> >> >>> >> >> >> +	cpus_read_unlock();
> >> >>> >> >> >> +
> >> >>> >> >> >> +	if (flushed)
> >> >>> >> >> >> +		rcu_barrier();
> >> >>> >> >> > 
> >> >>> >> >> > I think we need to call rcu_barrier() even if flushed == false?
> >> >>> >> >> > 
> >> >>> >> >> > Maybe a kvfree_rcu()'d object was already waiting for the rcu callback to
> >> >>> >> >> > be processed before flush_all_rcu_sheaves() is called, and
> >> >>> >> >> > in flush_all_rcu_sheaves() we skipped all (cache, cpu) pairs,
> >> >>> >> >> > so flushed == false but the rcu callback isn't processed yet
> >> >>> >> >> > by the end of the function?
> >> >>> >> >> > 
> >> >>> >> >> > That sounds like a very unlikely to happen in a realistic scenario,
> >> >>> >> >> > but still possible...
> >> >>> >> >> 
> >> >>> >> >> Yes also good point, will flush unconditionally.
> >> >>> >> >> 
> >> >>> >> >> Maybe in __kfree_rcu_sheaf() I should also move the call_rcu(...) before
> >> >>> >> >> local_unlock().
> >> >>> >> >>
> >> >>> >> >> So we don't end up seeing a NULL pcs->rcu_free in
> >> >>> >> >> flush_all_rcu_sheaves() because __kfree_rcu_sheaf() already set it to NULL,
> >> >>> >> >> but didn't yet do the call_rcu() as it got preempted after local_unlock().
> >> >>> >> > 
> >> >>> >> > Makes sense to me.
> >> >>> > 
> >> >>> > Wait, I'm confused.
> >> >>> > 
> >> >>> > I think the caller of kvfree_rcu_barrier() should make sure that it's invoked
> >> >>> > only after a kvfree_rcu(X, rhs) call has returned, if the caller expects
> >> >>> > the object X to be freed before kvfree_rcu_barrier() returns?
> >> >>> 
> >> >>> Hmm, the caller of kvfree_rcu(X, rhs) might have returned without filling up
> >> >>> the rcu_sheaf fully and thus without submitting it to call_rcu(), then
> >> >>> migrated to another cpu. Then it calls kvfree_rcu_barrier() while another
> >> >>> unrelated kvfree_rcu(X, rhs) call on the previous cpu is for the same
> >> >>> kmem_cache (kvfree_rcu_barrier() is not only for cache destruction), fills
> >> >>> up the rcu_sheaf fully and is about to call_rcu() on it. And since that
> >> >>> sheaf also contains the object X, we should make sure that is flushed.
> >> >> 
> >> >> I was going to say "but we queue and wait for the flushing work to
> >> >> complete, so the sheaf containing object X should be flushed?"
> >> >> 
> >> >> But nah, that's true only if we see pcs->rcu_free != NULL in
> >> >> flush_all_rcu_sheaves().
> >> >> 
> >> >> You are right...
> >> >> 
> >> >> Hmm, maybe it's simpler to fix this by never skipping queueing the work
> >> >> even when pcs->rcu_sheaf == NULL?
> >> > 
> >> > I guess it's simpler, yeah.
> >> 
> >> So what about this? The unconditional queueing should cover all races with
> >> __kfree_rcu_sheaf() so there's just unconditional rcu_barrier() in the end.
> >> 
> >> From 0722b29fa1625b31c05d659d1d988ec882247b38 Mon Sep 17 00:00:00 2001
> >> From: Vlastimil Babka <vbabka@suse.cz>
> >> Date: Wed, 3 Sep 2025 14:59:46 +0200
> >> Subject: [PATCH] slab: add sheaf support for batching kfree_rcu() operations
> >> 
> >> Extend the sheaf infrastructure for more efficient kfree_rcu() handling.
> >> For caches with sheaves, on each cpu maintain a rcu_free sheaf in
> >> addition to main and spare sheaves.
> >> 
> >> kfree_rcu() operations will try to put objects on this sheaf. Once full,
> >> the sheaf is detached and submitted to call_rcu() with a handler that
> >> will try to put it in the barn, or flush to slab pages using bulk free,
> >> when the barn is full. Then a new empty sheaf must be obtained to put
> >> more objects there.
> >> 
> >> It's possible that no free sheaves are available to use for a new
> >> rcu_free sheaf, and the allocation in kfree_rcu() context can only use
> >> GFP_NOWAIT and thus may fail. In that case, fall back to the existing
> >> kfree_rcu() implementation.
> >> 
> >> Expected advantages:
> >> - batching the kfree_rcu() operations, that could eventually replace the
> >>   existing batching
> >> - sheaves can be reused for allocations via barn instead of being
> >>   flushed to slabs, which is more efficient
> >>   - this includes cases where only some cpus are allowed to process rcu
> >>     callbacks (Android)
> >> 
> >> Possible disadvantage:
> >> - objects might be waiting for more than their grace period (it is
> >>   determined by the last object freed into the sheaf), increasing memory
> >>   usage - but the existing batching does that too.
> >> 
> >> Only implement this for CONFIG_KVFREE_RCU_BATCHED as the tiny
> >> implementation favors smaller memory footprint over performance.
> >> 
> >> Also for now skip the usage of rcu sheaf for CONFIG_PREEMPT_RT as the
> >> contexts where kfree_rcu() is called might not be compatible with taking
> >> a barn spinlock or a GFP_NOWAIT allocation of a new sheaf taking a
> >> spinlock - the current kfree_rcu() implementation avoids doing that.
> >> 
> >> Teach kvfree_rcu_barrier() to flush all rcu_free sheaves from all caches
> >> that have them. This is not a cheap operation, but the barrier usage is
> >> rare - currently kmem_cache_destroy() or on module unload.
> >> 
> >> Add CONFIG_SLUB_STATS counters free_rcu_sheaf and free_rcu_sheaf_fail to
> >> count how many kfree_rcu() used the rcu_free sheaf successfully and how
> >> many had to fall back to the existing implementation.
> >> 
> >> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> >> ---
> > 
> > Looks good to me,
> > Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
> 
> Thanks.
> 
> >> +do_free:
> >> +
> >> +	rcu_sheaf = pcs->rcu_free;
> >> +
> >> +	rcu_sheaf->objects[rcu_sheaf->size++] = obj;
> >> +
> >> +	if (likely(rcu_sheaf->size < s->sheaf_capacity))
> >> +		rcu_sheaf = NULL;
> >> +	else
> >> +		pcs->rcu_free = NULL;
> >> +
> >> +	/*
> >> +	 * we flush before local_unlock to make sure a racing
> >> +	 * flush_all_rcu_sheaves() doesn't miss this sheaf
> >> +	 */
> >> +	if (rcu_sheaf)
> >> +		call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf);
> > 
> > nit: now we don't have to put this inside local_lock()~local_unlock()?
> 
> I think we still need to? AFAICS I wrote before is still true:
> 
> The caller of kvfree_rcu(X, rhs) might have returned without filling up
> the rcu_sheaf fully and thus without submitting it to call_rcu(), then
> migrated to another cpu. Then it calls kvfree_rcu_barrier() while another
> unrelated kvfree_rcu(X, rhs) call on the previous cpu is for the same
> kmem_cache (kvfree_rcu_barrier() is not only for cache destruction), fills
> up the rcu_sheaf fully and is about to call_rcu() on it.
>
> If it can local_unlock() before doing the call_rcu(), it can local_unlock(),
> get preempted, and our flush worqueue handler will only see there's no
> rcu_free sheaf and do nothing.

Oops, you're right. So even if a previous kvfree_rcu() has returned
and then kvfree_rcu_barrier() is called, a later kvfree_rcu() call can
make the sheaf invisible to the flush workqueue handler if it calls
call_rcu() outside the critical section because it can be preempted by
the workqueue handler after local_unlock() but before calling
call_rcu().

> If if must call_rcu() before local_unlock(), our flush workqueue handler
> will not execute on the cpu until it performs the call_rcu() and
> local_unlock(), because it can't preempt that section (!RT) or will have to
> wait doing local_lock() in flush_rcu_sheaf() (RT) - here it's important it
> takes the lock unconditionally.

Right.

My nit was wrong and it looks good to me then!

-- 
Cheers,
Harry / Hyeonggon

Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations

Posted by Sergey Senozhatsky 2 weeks, 6 days ago

Hi Vlastimil,

On (25/09/10 10:01), Vlastimil Babka wrote:
[..]
> +
> +	if (rcu_free)
> +		call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn);
> +}
> +
> +
> +/* needed for kvfree_rcu_barrier() */
> +void flush_all_rcu_sheaves()
> +{

mm/slub.c:3960:27: error: a function declaration without a prototype is deprecated in all versions of C [-Werror,-Wstrict-prototypes]
 3960 | void flush_all_rcu_sheaves()
      |                           ^
      |                            void

---

diff --git a/mm/slub.c b/mm/slub.c
index 11ad4173e2f2..a1eae71a0f8c 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3955,9 +3955,8 @@ static void flush_rcu_sheaf(struct work_struct *w)
 		call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn);
 }
 
-
 /* needed for kvfree_rcu_barrier() */
-void flush_all_rcu_sheaves()
+void flush_all_rcu_sheaves(void)
 {
 	struct slub_percpu_sheaves *pcs;
 	struct slub_flush_work *sfw;

Re: [PATCH v8 04/23] slab: add sheaf support for batching kfree_rcu() operations

Posted by Vlastimil Babka 2 weeks, 6 days ago

On 9/12/25 02:38, Sergey Senozhatsky wrote:
> Hi Vlastimil,
> 
> On (25/09/10 10:01), Vlastimil Babka wrote:
> [..]
>> +
>> +	if (rcu_free)
>> +		call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn);
>> +}
>> +
>> +
>> +/* needed for kvfree_rcu_barrier() */
>> +void flush_all_rcu_sheaves()
>> +{
> 
> mm/slub.c:3960:27: error: a function declaration without a prototype is deprecated in all versions of C [-Werror,-Wstrict-prototypes]
>  3960 | void flush_all_rcu_sheaves()
>       |                           ^
>       |                            void
> 
> ---

Thanks, the bots told me too and it's fixed in -next

> diff --git a/mm/slub.c b/mm/slub.c
> index 11ad4173e2f2..a1eae71a0f8c 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -3955,9 +3955,8 @@ static void flush_rcu_sheaf(struct work_struct *w)
>  		call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn);
>  }
>  
> -
>  /* needed for kvfree_rcu_barrier() */
> -void flush_all_rcu_sheaves()
> +void flush_all_rcu_sheaves(void)
>  {
>  	struct slub_percpu_sheaves *pcs;
>  	struct slub_flush_work *sfw;