[PATCH RFC v2 06/10] slab: sheaf prefilling for guaranteed allocations

Posted by Vlastimil Babka 10 months, 1 week ago
Add functions for efficient guaranteed allocations e.g. in a critical
section that cannot sleep, when the exact number of allocations is not
known beforehand, but an upper limit can be calculated.

kmem_cache_prefill_sheaf() returns a sheaf containing at least the given
number of objects.

kmem_cache_alloc_from_sheaf() will allocate an object from the sheaf
and is guaranteed not to fail until the sheaf is depleted.

kmem_cache_return_sheaf() is for giving the sheaf back to the slab
allocator after the critical section. This will also attempt to refill
it to the cache's sheaf capacity for more efficient sheaf handling, but
the refill is not strictly required to succeed.

kmem_cache_refill_sheaf() can be used to refill a previously obtained
sheaf to the requested size. If the current size is sufficient, it does
nothing. If the requested size exceeds the cache's sheaf_capacity and the
sheaf's current capacity, the sheaf will be replaced with a new one,
hence the indirect pointer parameter.

kmem_cache_sheaf_size() can be used to query the current size.

The implementation supports requesting sizes that exceed the cache's
sheaf_capacity, but it is not efficient - such sheaves are allocated
fresh in kmem_cache_prefill_sheaf() and flushed and freed immediately by
kmem_cache_return_sheaf(). kmem_cache_refill_sheaf() might be especially
inefficient when replacing a sheaf with a new one of a larger capacity.
It is therefore better to size the cache's sheaf_capacity accordingly.
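
A typical use could look like the following sketch (the cache 's', the
spinlock, the upper bound 'max_objs' and the loop condition are made-up
placeholders, not part of this patch):

	/* outside the critical section; prefilling may sleep with GFP_KERNEL */
	sheaf = kmem_cache_prefill_sheaf(s, GFP_KERNEL, max_objs);
	if (!sheaf)
		return -ENOMEM;

	spin_lock(&lock);	/* critical section, cannot sleep */
	while (need_more_objects()) {
		/* guaranteed to succeed for up to max_objs allocations */
		obj = kmem_cache_alloc_from_sheaf(s, GFP_NOWAIT, sheaf);
		...
	}
	spin_unlock(&lock);

	/* with a sleepable gfp the sheaf may also be refilled for reuse */
	kmem_cache_return_sheaf(s, GFP_KERNEL, sheaf);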

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/slab.h |  16 ++++
 mm/slub.c            | 227 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 243 insertions(+)

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 0e1b25228c77140d05b5b4433c9d7923de36ec05..dd01b67982e856b1b02f4f0e6fc557726e7f02a8 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -829,6 +829,22 @@ void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t flags,
 				   int node) __assume_slab_alignment __malloc;
 #define kmem_cache_alloc_node(...)	alloc_hooks(kmem_cache_alloc_node_noprof(__VA_ARGS__))
 
+struct slab_sheaf *
+kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size);
+
+int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp,
+		struct slab_sheaf **sheafp, unsigned int size);
+
+void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp,
+				       struct slab_sheaf *sheaf);
+
+void *kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *cachep, gfp_t gfp,
+			struct slab_sheaf *sheaf) __assume_slab_alignment __malloc;
+#define kmem_cache_alloc_from_sheaf(...)	\
+			alloc_hooks(kmem_cache_alloc_from_sheaf_noprof(__VA_ARGS__))
+
+unsigned int kmem_cache_sheaf_size(struct slab_sheaf *sheaf);
+
 /*
  * These macros allow declaring a kmem_buckets * parameter alongside size, which
  * can be compiled out with CONFIG_SLAB_BUCKETS=n so that a large number of call
diff --git a/mm/slub.c b/mm/slub.c
index 3d7345e7e938d53950ed0d6abe8eb0e93cf8f5b1..c1df7cf22267f28f743404531bef921e25fac086 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -443,6 +443,8 @@ struct slab_sheaf {
 	union {
 		struct rcu_head rcu_head;
 		struct list_head barn_list;
+		/* only used for prefilled sheaves */
+		unsigned int capacity;
 	};
 	struct kmem_cache *cache;
 	unsigned int size;
@@ -2735,6 +2737,30 @@ static int barn_put_full_sheaf(struct node_barn *barn, struct slab_sheaf *sheaf,
 	return ret;
 }
 
+static struct slab_sheaf *barn_get_full_or_empty_sheaf(struct node_barn *barn)
+{
+	struct slab_sheaf *sheaf = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&barn->lock, flags);
+
+	if (barn->nr_full) {
+		sheaf = list_first_entry(&barn->sheaves_full, struct slab_sheaf,
+					barn_list);
+		list_del(&sheaf->barn_list);
+		barn->nr_full--;
+	} else if (barn->nr_empty) {
+		sheaf = list_first_entry(&barn->sheaves_empty,
+					 struct slab_sheaf, barn_list);
+		list_del(&sheaf->barn_list);
+		barn->nr_empty--;
+	}
+
+	spin_unlock_irqrestore(&barn->lock, flags);
+
+	return sheaf;
+}
+
 /*
  * If a full sheaf is available, return it and put the supplied empty one to
  * barn. We ignore the limit on empty sheaves as the number of sheaves doesn't
@@ -4831,6 +4857,207 @@ void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t gfpflags, int nod
 }
 EXPORT_SYMBOL(kmem_cache_alloc_node_noprof);
 
+
+/*
+ * returns a sheaf that has at least the requested size
+ * when prefilling is needed, do so with the given gfp flags
+ *
+ * return NULL if sheaf allocation or prefilling failed
+ */
+struct slab_sheaf *
+kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size)
+{
+	struct slub_percpu_sheaves *pcs;
+	struct slab_sheaf *sheaf = NULL;
+
+	if (unlikely(size > s->sheaf_capacity)) {
+		sheaf = kzalloc(struct_size(sheaf, objects, size), gfp);
+		if (!sheaf)
+			return NULL;
+
+		sheaf->cache = s;
+		sheaf->capacity = size;
+
+		if (!__kmem_cache_alloc_bulk(s, gfp, size,
+					     &sheaf->objects[0])) {
+			kfree(sheaf);
+			return NULL;
+		}
+
+		sheaf->size = size;
+
+		return sheaf;
+	}
+
+	localtry_lock(&s->cpu_sheaves->lock);
+	pcs = this_cpu_ptr(s->cpu_sheaves);
+
+	if (pcs->spare) {
+		sheaf = pcs->spare;
+		pcs->spare = NULL;
+	}
+
+	if (!sheaf)
+		sheaf = barn_get_full_or_empty_sheaf(pcs->barn);
+
+	localtry_unlock(&s->cpu_sheaves->lock);
+
+	if (!sheaf) {
+		sheaf = alloc_empty_sheaf(s, gfp);
+	}
+
+	if (sheaf && sheaf->size < size) {
+		if (refill_sheaf(s, sheaf, gfp)) {
+			sheaf_flush(s, sheaf);
+			free_empty_sheaf(s, sheaf);
+			sheaf = NULL;
+		}
+	}
+
+	if (sheaf)
+		sheaf->capacity = s->sheaf_capacity;
+
+	return sheaf;
+}
+
+/*
+ * Use this to return a sheaf obtained by kmem_cache_prefill_sheaf().
+ * It tries to refill the sheaf back to the cache's sheaf_capacity
+ * to avoid handling partially full sheaves.
+ *
+ * If the refill fails because gfp is e.g. GFP_NOWAIT, the sheaf is
+ * instead dissolved.
+ */
+void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp,
+			     struct slab_sheaf *sheaf)
+{
+	struct slub_percpu_sheaves *pcs;
+	bool refill = false;
+	struct node_barn *barn;
+
+	if (unlikely(sheaf->capacity != s->sheaf_capacity)) {
+		sheaf_flush(s, sheaf);
+		kfree(sheaf);
+		return;
+	}
+
+	localtry_lock(&s->cpu_sheaves->lock);
+	pcs = this_cpu_ptr(s->cpu_sheaves);
+
+	if (!pcs->spare) {
+		pcs->spare = sheaf;
+		sheaf = NULL;
+	} else if (pcs->barn->nr_full >= MAX_FULL_SHEAVES) {
+		/* racy check */
+		barn = pcs->barn;
+		refill = true;
+	}
+
+	localtry_unlock(&s->cpu_sheaves->lock);
+
+	if (!sheaf)
+		return;
+
+	/*
+	 * if the barn is full of full sheaves or we fail to refill the sheaf,
+	 * simply flush and free it
+	 */
+	if (!refill || refill_sheaf(s, sheaf, gfp)) {
+		sheaf_flush(s, sheaf);
+		free_empty_sheaf(s, sheaf);
+		return;
+	}
+
+	/* we racily determined the sheaf would fit, so now force it */
+	barn_put_full_sheaf(barn, sheaf, true);
+}
+
+/*
+ * refill a sheaf previously returned by kmem_cache_prefill_sheaf to at least
+ * the given size
+ *
+ * the sheaf might be replaced by a new one when requesting more than
+ * s->sheaf_capacity objects. If such replacement is necessary but the refill
+ * fails (with -ENOMEM), the existing sheaf is left intact
+ */
+int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp,
+			    struct slab_sheaf **sheafp, unsigned int size)
+{
+	struct slab_sheaf *sheaf;
+
+	/*
+	 * TODO: do we want to support *sheaf == NULL to be equivalent of
+	 * kmem_cache_prefill_sheaf() ?
+	 */
+	if (!sheafp || !(*sheafp))
+		return -EINVAL;
+
+	sheaf = *sheafp;
+	if (sheaf->size >= size)
+		return 0;
+
+	if (likely(sheaf->capacity >= size)) {
+		if (likely(sheaf->capacity == s->sheaf_capacity))
+			return refill_sheaf(s, sheaf, gfp);
+
+		if (!__kmem_cache_alloc_bulk(s, gfp, sheaf->capacity - sheaf->size,
+					     &sheaf->objects[sheaf->size])) {
+			return -ENOMEM;
+		}
+		sheaf->size = sheaf->capacity;
+
+		return 0;
+	}
+
+	/*
+	 * We had a regular sized sheaf and need an oversize one, or we had an
+	 * oversize one already but need a larger one now.
+	 * This should be a very rare path so let's not complicate it.
+	 */
+	sheaf = kmem_cache_prefill_sheaf(s, gfp, size);
+	if (!sheaf)
+		return -ENOMEM;
+
+	kmem_cache_return_sheaf(s, gfp, *sheafp);
+	*sheafp = sheaf;
+	return 0;
+}
+
+/*
+ * Allocate from a sheaf obtained by kmem_cache_prefill_sheaf()
+ *
+ * Guaranteed not to fail for as many allocations as the requested prefill size.
+ * After the sheaf is emptied, it fails - no fallback to the slab cache itself.
+ *
+ * The gfp parameter is meant only to specify __GFP_ZERO or __GFP_ACCOUNT;
+ * memcg charging is forced over the limit if necessary, to avoid failure.
+ */
+void *
+kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *s, gfp_t gfp,
+				   struct slab_sheaf *sheaf)
+{
+	void *ret = NULL;
+	bool init;
+
+	if (sheaf->size == 0)
+		goto out;
+
+	ret = sheaf->objects[--sheaf->size];
+
+	init = slab_want_init_on_alloc(gfp, s);
+
+	/* add __GFP_NOFAIL to force successful memcg charging */
+	slab_post_alloc_hook(s, NULL, gfp | __GFP_NOFAIL, 1, &ret, init, s->object_size);
+out:
+	trace_kmem_cache_alloc(_RET_IP_, ret, s, gfp, NUMA_NO_NODE);
+
+	return ret;
+}
+
+unsigned int kmem_cache_sheaf_size(struct slab_sheaf *sheaf)
+{
+	return sheaf->size;
+}
 /*
  * To avoid unnecessary overhead, we pass through large allocation requests
  * directly to the page allocator. We use __GFP_COMP, because we will need to

-- 
2.48.1
Re: [PATCH RFC v2 06/10] slab: sheaf prefilling for guaranteed allocations
Posted by Harry Yoo 9 months, 3 weeks ago
On Fri, Feb 14, 2025 at 05:27:42PM +0100, Vlastimil Babka wrote:
> Add functions for efficient guaranteed allocations e.g. in a critical
> section that cannot sleep, when the exact number of allocations is not
> known beforehand, but an upper limit can be calculated.
> 
> kmem_cache_prefill_sheaf() returns a sheaf containing at least given
> number of objects.
> 
> kmem_cache_alloc_from_sheaf() will allocate an object from the sheaf
> and is guaranteed not to fail until depleted.
> 
> kmem_cache_return_sheaf() is for giving the sheaf back to the slab
> allocator after the critical section. This will also attempt to refill
> it to cache's sheaf capacity for better efficiency of sheaves handling,
> but it's not stricly necessary to succeed.
> 
> kmem_cache_refill_sheaf() can be used to refill a previously obtained
> sheaf to requested size. If the current size is sufficient, it does
> nothing. If the requested size exceeds cache's sheaf_capacity and the
> sheaf's current capacity, the sheaf will be replaced with a new one,
> hence the indirect pointer parameter.
> 
> kmem_cache_sheaf_size() can be used to query the current size.
> 
> The implementation supports requesting sizes that exceed cache's
> sheaf_capacity, but it is not efficient - such sheaves are allocated
> fresh in kmem_cache_prefill_sheaf() and flushed and freed immediately by
> kmem_cache_return_sheaf(). kmem_cache_refill_sheaf() might be expecially
> ineffective when replacing a sheaf with a new one of a larger capacity.
> It is therefore better to size cache's sheaf_capacity accordingly.
> 
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> ---
>  include/linux/slab.h |  16 ++++
>  mm/slub.c            | 227 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 243 insertions(+)

[... snip ... ]

> @@ -4831,6 +4857,207 @@ void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t gfpflags, int nod
>  }
>  EXPORT_SYMBOL(kmem_cache_alloc_node_noprof);
>  
> +
> +/*
> + * returns a sheaf that has least the requested size
> + * when prefilling is needed, do so with given gfp flags
> + *
> + * return NULL if sheaf allocation or prefilling failed
> + */
> +struct slab_sheaf *
> +kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size)
> +{
> +	struct slub_percpu_sheaves *pcs;
> +	struct slab_sheaf *sheaf = NULL;
> +
> +	if (unlikely(size > s->sheaf_capacity)) {
> +		sheaf = kzalloc(struct_size(sheaf, objects, size), gfp);
> +		if (!sheaf)
> +			return NULL;
> +
> +		sheaf->cache = s;
> +		sheaf->capacity = size;
> +
> +		if (!__kmem_cache_alloc_bulk(s, gfp, size,
> +					     &sheaf->objects[0])) {
> +			kfree(sheaf);
> +			return NULL;
> +		}
> +
> +		sheaf->size = size;
> +
> +		return sheaf;
> +	}
> +
> +	localtry_lock(&s->cpu_sheaves->lock);
> +	pcs = this_cpu_ptr(s->cpu_sheaves);
> +
> +	if (pcs->spare) {
> +		sheaf = pcs->spare;
> +		pcs->spare = NULL;
> +	}
> +
> +	if (!sheaf)
> +		sheaf = barn_get_full_or_empty_sheaf(pcs->barn);

Can this be outside localtry lock?

> +
> +	localtry_unlock(&s->cpu_sheaves->lock);
> +
> +	if (!sheaf) {
> +		sheaf = alloc_empty_sheaf(s, gfp);
> +	}
> +
> +	if (sheaf && sheaf->size < size) {
> +		if (refill_sheaf(s, sheaf, gfp)) {
> +			sheaf_flush(s, sheaf);
> +			free_empty_sheaf(s, sheaf);
> +			sheaf = NULL;
> +		}
> +	}
> +
> +	if (sheaf)
> +		sheaf->capacity = s->sheaf_capacity;
> +
> +	return sheaf;
> +}
> +
> +/*
> + * Use this to return a sheaf obtained by kmem_cache_prefill_sheaf()
> + * It tries to refill the sheaf back to the cache's sheaf_capacity
> + * to avoid handling partially full sheaves.
> + *
> + * If the refill fails because gfp is e.g. GFP_NOWAIT, the sheaf is
> + * instead dissolved
> + */
> +void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp,
> +			     struct slab_sheaf *sheaf)
> +{
> +	struct slub_percpu_sheaves *pcs;
> +	bool refill = false;
> +	struct node_barn *barn;
> +
> +	if (unlikely(sheaf->capacity != s->sheaf_capacity)) {
> +		sheaf_flush(s, sheaf);
> +		kfree(sheaf);
> +		return;
> +	}
> +
> +	localtry_lock(&s->cpu_sheaves->lock);
> +	pcs = this_cpu_ptr(s->cpu_sheaves);
> +
> +	if (!pcs->spare) {
> +		pcs->spare = sheaf;
> +		sheaf = NULL;
> +	} else if (pcs->barn->nr_full >= MAX_FULL_SHEAVES) {

Did you mean (pcs->barn->nr_full < MAX_FULL_SHEAVES)?

Otherwise looks good to me.

-- 
Cheers,
Harry

> +		/* racy check */
> +		barn = pcs->barn;
> +		refill = true;
> +	}
> +
> +	localtry_unlock(&s->cpu_sheaves->lock);
> +
> +	if (!sheaf)
> +		return;
> +
> +	/*
> +	 * if the barn is full of full sheaves or we fail to refill the sheaf,
> +	 * simply flush and free it
> +	 */
> +	if (!refill || refill_sheaf(s, sheaf, gfp)) {
> +		sheaf_flush(s, sheaf);
> +		free_empty_sheaf(s, sheaf);
> +		return;
> +	}
> +
> +	/* we racily determined the sheaf would fit, so now force it */
> +	barn_put_full_sheaf(barn, sheaf, true);
> +}
Re: [PATCH RFC v2 06/10] slab: sheaf prefilling for guaranteed allocations
Posted by Vlastimil Babka 9 months, 1 week ago
On 2/25/25 09:00, Harry Yoo wrote:
> On Fri, Feb 14, 2025 at 05:27:42PM +0100, Vlastimil Babka wrote:
>> Add functions for efficient guaranteed allocations e.g. in a critical
>> section that cannot sleep, when the exact number of allocations is not
>> known beforehand, but an upper limit can be calculated.
>> 
>> kmem_cache_prefill_sheaf() returns a sheaf containing at least given
>> number of objects.
>> 
>> kmem_cache_alloc_from_sheaf() will allocate an object from the sheaf
>> and is guaranteed not to fail until depleted.
>> 
>> kmem_cache_return_sheaf() is for giving the sheaf back to the slab
>> allocator after the critical section. This will also attempt to refill
>> it to cache's sheaf capacity for better efficiency of sheaves handling,
>> but it's not stricly necessary to succeed.
>> 
>> kmem_cache_refill_sheaf() can be used to refill a previously obtained
>> sheaf to requested size. If the current size is sufficient, it does
>> nothing. If the requested size exceeds cache's sheaf_capacity and the
>> sheaf's current capacity, the sheaf will be replaced with a new one,
>> hence the indirect pointer parameter.
>> 
>> kmem_cache_sheaf_size() can be used to query the current size.
>> 
>> The implementation supports requesting sizes that exceed cache's
>> sheaf_capacity, but it is not efficient - such sheaves are allocated
>> fresh in kmem_cache_prefill_sheaf() and flushed and freed immediately by
>> kmem_cache_return_sheaf(). kmem_cache_refill_sheaf() might be expecially
>> ineffective when replacing a sheaf with a new one of a larger capacity.
>> It is therefore better to size cache's sheaf_capacity accordingly.
>> 
>> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
>> ---
>>  include/linux/slab.h |  16 ++++
>>  mm/slub.c            | 227 +++++++++++++++++++++++++++++++++++++++++++++++++++
>>  2 files changed, 243 insertions(+)
> 
> [... snip ... ]
> 
>> @@ -4831,6 +4857,207 @@ void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t gfpflags, int nod
>>  }
>>  EXPORT_SYMBOL(kmem_cache_alloc_node_noprof);
>>  
>> +
>> +/*
>> + * returns a sheaf that has least the requested size
>> + * when prefilling is needed, do so with given gfp flags
>> + *
>> + * return NULL if sheaf allocation or prefilling failed
>> + */
>> +struct slab_sheaf *
>> +kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size)
>> +{
>> +	struct slub_percpu_sheaves *pcs;
>> +	struct slab_sheaf *sheaf = NULL;
>> +
>> +	if (unlikely(size > s->sheaf_capacity)) {
>> +		sheaf = kzalloc(struct_size(sheaf, objects, size), gfp);
>> +		if (!sheaf)
>> +			return NULL;
>> +
>> +		sheaf->cache = s;
>> +		sheaf->capacity = size;
>> +
>> +		if (!__kmem_cache_alloc_bulk(s, gfp, size,
>> +					     &sheaf->objects[0])) {
>> +			kfree(sheaf);
>> +			return NULL;
>> +		}
>> +
>> +		sheaf->size = size;
>> +
>> +		return sheaf;
>> +	}
>> +
>> +	localtry_lock(&s->cpu_sheaves->lock);
>> +	pcs = this_cpu_ptr(s->cpu_sheaves);
>> +
>> +	if (pcs->spare) {
>> +		sheaf = pcs->spare;
>> +		pcs->spare = NULL;
>> +	}
>> +
>> +	if (!sheaf)
>> +		sheaf = barn_get_full_or_empty_sheaf(pcs->barn);
> 
> Can this be outside localtry lock?

Strictly speaking we'd have to save the barn pointer first, otherwise cpu
hotremove could bite us, I think. But not worth the trouble, as localtry
lock is just disabling preemption and taking the barn lock would disable
irqs anyway. So we're not increasing contention by holding the localtry lock
more than strictly necessary.
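
For reference, the discussed variant would look roughly like this (just a
sketch for illustration, not something the patch proposes):

	localtry_lock(&s->cpu_sheaves->lock);
	pcs = this_cpu_ptr(s->cpu_sheaves);

	if (pcs->spare) {
		sheaf = pcs->spare;
		pcs->spare = NULL;
	}
	/* save the pointer so cpu hotremove can't pull the barn from under us */
	barn = pcs->barn;

	localtry_unlock(&s->cpu_sheaves->lock);

	if (!sheaf)
		sheaf = barn_get_full_or_empty_sheaf(barn);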

> 
>> +
>> +	localtry_unlock(&s->cpu_sheaves->lock);
>> +
>> +	if (!sheaf) {
>> +		sheaf = alloc_empty_sheaf(s, gfp);
>> +	}
>> +
>> +	if (sheaf && sheaf->size < size) {
>> +		if (refill_sheaf(s, sheaf, gfp)) {
>> +			sheaf_flush(s, sheaf);
>> +			free_empty_sheaf(s, sheaf);
>> +			sheaf = NULL;
>> +		}
>> +	}
>> +
>> +	if (sheaf)
>> +		sheaf->capacity = s->sheaf_capacity;
>> +
>> +	return sheaf;
>> +}
>> +
>> +/*
>> + * Use this to return a sheaf obtained by kmem_cache_prefill_sheaf()
>> + * It tries to refill the sheaf back to the cache's sheaf_capacity
>> + * to avoid handling partially full sheaves.
>> + *
>> + * If the refill fails because gfp is e.g. GFP_NOWAIT, the sheaf is
>> + * instead dissolved
>> + */
>> +void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp,
>> +			     struct slab_sheaf *sheaf)
>> +{
>> +	struct slub_percpu_sheaves *pcs;
>> +	bool refill = false;
>> +	struct node_barn *barn;
>> +
>> +	if (unlikely(sheaf->capacity != s->sheaf_capacity)) {
>> +		sheaf_flush(s, sheaf);
>> +		kfree(sheaf);
>> +		return;
>> +	}
>> +
>> +	localtry_lock(&s->cpu_sheaves->lock);
>> +	pcs = this_cpu_ptr(s->cpu_sheaves);
>> +
>> +	if (!pcs->spare) {
>> +		pcs->spare = sheaf;
>> +		sheaf = NULL;
>> +	} else if (pcs->barn->nr_full >= MAX_FULL_SHEAVES) {
> 
> Did you mean (pcs->barn->nr_full < MAX_FULL_SHEAVES)?

Oops yeah, fixing this can potentially improve performance.

> Otherwise looks good to me.

Thanks a lot!
Re: [PATCH RFC v2 06/10] slab: sheaf prefilling for guaranteed allocations
Posted by Suren Baghdasaryan 9 months, 4 weeks ago
On Fri, Feb 14, 2025 at 8:27 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> Add functions for efficient guaranteed allocations e.g. in a critical
> section that cannot sleep, when the exact number of allocations is not
> known beforehand, but an upper limit can be calculated.
>
> kmem_cache_prefill_sheaf() returns a sheaf containing at least given
> number of objects.
>
> kmem_cache_alloc_from_sheaf() will allocate an object from the sheaf
> and is guaranteed not to fail until depleted.
>
> kmem_cache_return_sheaf() is for giving the sheaf back to the slab
> allocator after the critical section. This will also attempt to refill
> it to cache's sheaf capacity for better efficiency of sheaves handling,
> but it's not stricly necessary to succeed.
>
> kmem_cache_refill_sheaf() can be used to refill a previously obtained
> sheaf to requested size. If the current size is sufficient, it does
> nothing. If the requested size exceeds cache's sheaf_capacity and the
> sheaf's current capacity, the sheaf will be replaced with a new one,
> hence the indirect pointer parameter.
>
> kmem_cache_sheaf_size() can be used to query the current size.
>
> The implementation supports requesting sizes that exceed cache's
> sheaf_capacity, but it is not efficient - such sheaves are allocated
> fresh in kmem_cache_prefill_sheaf() and flushed and freed immediately by
> kmem_cache_return_sheaf(). kmem_cache_refill_sheaf() might be expecially

s/expecially/especially

> ineffective when replacing a sheaf with a new one of a larger capacity.
> It is therefore better to size cache's sheaf_capacity accordingly.

If support for sizes exceeding sheaf_capacity adds much complexity
with no performance benefits, I think it would be ok not to support
them at all. Users know the capacity of a particular kmem_cache, so
they can use this API only when their needs are within sheaf_capacity,
otherwise either size the sheaf appropriately or use slab bulk
allocation.

>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>

Reviewed-by: Suren Baghdasaryan <surenb@google.com>

> ---
>  include/linux/slab.h |  16 ++++
>  mm/slub.c            | 227 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 243 insertions(+)
>
> diff --git a/include/linux/slab.h b/include/linux/slab.h
> index 0e1b25228c77140d05b5b4433c9d7923de36ec05..dd01b67982e856b1b02f4f0e6fc557726e7f02a8 100644
> --- a/include/linux/slab.h
> +++ b/include/linux/slab.h
> @@ -829,6 +829,22 @@ void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t flags,
>                                    int node) __assume_slab_alignment __malloc;
>  #define kmem_cache_alloc_node(...)     alloc_hooks(kmem_cache_alloc_node_noprof(__VA_ARGS__))
>
> +struct slab_sheaf *
> +kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size);
> +
> +int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp,
> +               struct slab_sheaf **sheafp, unsigned int size);
> +
> +void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp,
> +                                      struct slab_sheaf *sheaf);
> +
> +void *kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *cachep, gfp_t gfp,
> +                       struct slab_sheaf *sheaf) __assume_slab_alignment __malloc;
> +#define kmem_cache_alloc_from_sheaf(...)       \
> +                       alloc_hooks(kmem_cache_alloc_from_sheaf_noprof(__VA_ARGS__))
> +
> +unsigned int kmem_cache_sheaf_size(struct slab_sheaf *sheaf);
> +
>  /*
>   * These macros allow declaring a kmem_buckets * parameter alongside size, which
>   * can be compiled out with CONFIG_SLAB_BUCKETS=n so that a large number of call
> diff --git a/mm/slub.c b/mm/slub.c
> index 3d7345e7e938d53950ed0d6abe8eb0e93cf8f5b1..c1df7cf22267f28f743404531bef921e25fac086 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -443,6 +443,8 @@ struct slab_sheaf {
>         union {
>                 struct rcu_head rcu_head;
>                 struct list_head barn_list;
> +               /* only used for prefilled sheafs */
> +               unsigned int capacity;
>         };
>         struct kmem_cache *cache;
>         unsigned int size;
> @@ -2735,6 +2737,30 @@ static int barn_put_full_sheaf(struct node_barn *barn, struct slab_sheaf *sheaf,
>         return ret;
>  }
>
> +static struct slab_sheaf *barn_get_full_or_empty_sheaf(struct node_barn *barn)
> +{
> +       struct slab_sheaf *sheaf = NULL;
> +       unsigned long flags;
> +
> +       spin_lock_irqsave(&barn->lock, flags);
> +
> +       if (barn->nr_full) {
> +               sheaf = list_first_entry(&barn->sheaves_full, struct slab_sheaf,
> +                                       barn_list);
> +               list_del(&sheaf->barn_list);
> +               barn->nr_full--;
> +       } else if (barn->nr_empty) {
> +               sheaf = list_first_entry(&barn->sheaves_empty,
> +                                        struct slab_sheaf, barn_list);
> +               list_del(&sheaf->barn_list);
> +               barn->nr_empty--;
> +       }
> +
> +       spin_unlock_irqrestore(&barn->lock, flags);
> +
> +       return sheaf;
> +}
> +
>  /*
>   * If a full sheaf is available, return it and put the supplied empty one to
>   * barn. We ignore the limit on empty sheaves as the number of sheaves doesn't
> @@ -4831,6 +4857,207 @@ void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t gfpflags, int nod
>  }
>  EXPORT_SYMBOL(kmem_cache_alloc_node_noprof);
>
> +
> +/*
> + * returns a sheaf that has least the requested size
> + * when prefilling is needed, do so with given gfp flags
> + *
> + * return NULL if sheaf allocation or prefilling failed
> + */
> +struct slab_sheaf *
> +kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size)
> +{
> +       struct slub_percpu_sheaves *pcs;
> +       struct slab_sheaf *sheaf = NULL;
> +
> +       if (unlikely(size > s->sheaf_capacity)) {
> +               sheaf = kzalloc(struct_size(sheaf, objects, size), gfp);
> +               if (!sheaf)
> +                       return NULL;
> +
> +               sheaf->cache = s;
> +               sheaf->capacity = size;

After reviewing the code I would advocate that we support only sheaves
of s->sheaf_capacity, unless we have a real usecase requiring
sheaf->capacity != s->sheaf_capacity.

> +
> +               if (!__kmem_cache_alloc_bulk(s, gfp, size,
> +                                            &sheaf->objects[0])) {
> +                       kfree(sheaf);
> +                       return NULL;
> +               }
> +
> +               sheaf->size = size;
> +
> +               return sheaf;
> +       }
> +
> +       localtry_lock(&s->cpu_sheaves->lock);
> +       pcs = this_cpu_ptr(s->cpu_sheaves);
> +
> +       if (pcs->spare) {
> +               sheaf = pcs->spare;
> +               pcs->spare = NULL;
> +       }
> +
> +       if (!sheaf)
> +               sheaf = barn_get_full_or_empty_sheaf(pcs->barn);
> +
> +       localtry_unlock(&s->cpu_sheaves->lock);
> +
> +       if (!sheaf) {
> +               sheaf = alloc_empty_sheaf(s, gfp);
> +       }
> +
> +       if (sheaf && sheaf->size < size) {
> +               if (refill_sheaf(s, sheaf, gfp)) {
> +                       sheaf_flush(s, sheaf);
> +                       free_empty_sheaf(s, sheaf);
> +                       sheaf = NULL;
> +               }
> +       }
> +
> +       if (sheaf)
> +               sheaf->capacity = s->sheaf_capacity;
> +
> +       return sheaf;
> +}
> +
> +/*
> + * Use this to return a sheaf obtained by kmem_cache_prefill_sheaf()
> + * It tries to refill the sheaf back to the cache's sheaf_capacity
> + * to avoid handling partially full sheaves.
> + *
> + * If the refill fails because gfp is e.g. GFP_NOWAIT, the sheaf is
> + * instead dissolved

Refilling the sheaf here assumes that in the future we are more likely
to allocate than to free objects or shrink the slab. If the reverse is
true then it would make sense to flush the sheaf and add it as an
empty one into the barn. The fact that flushing can't fail would be
another advantage... We don't know the future but should we be
predicting a more costly case?

> + */
> +void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp,
> +                            struct slab_sheaf *sheaf)
> +{
> +       struct slub_percpu_sheaves *pcs;
> +       bool refill = false;
> +       struct node_barn *barn;
> +
> +       if (unlikely(sheaf->capacity != s->sheaf_capacity)) {
> +               sheaf_flush(s, sheaf);
> +               kfree(sheaf);
> +               return;
> +       }
> +
> +       localtry_lock(&s->cpu_sheaves->lock);
> +       pcs = this_cpu_ptr(s->cpu_sheaves);
> +
> +       if (!pcs->spare) {
> +               pcs->spare = sheaf;
> +               sheaf = NULL;
> +       } else if (pcs->barn->nr_full >= MAX_FULL_SHEAVES) {
> +               /* racy check */
> +               barn = pcs->barn;
> +               refill = true;
> +       }
> +
> +       localtry_unlock(&s->cpu_sheaves->lock);
> +
> +       if (!sheaf)
> +               return;
> +
> +       /*
> +        * if the barn is full of full sheaves or we fail to refill the sheaf,
> +        * simply flush and free it
> +        */
> +       if (!refill || refill_sheaf(s, sheaf, gfp)) {
> +               sheaf_flush(s, sheaf);
> +               free_empty_sheaf(s, sheaf);
> +               return;
> +       }
> +
> +       /* we racily determined the sheaf would fit, so now force it */
> +       barn_put_full_sheaf(barn, sheaf, true);
> +}
> +
> +/*
> + * refill a sheaf previously returned by kmem_cache_prefill_sheaf to at least
> + * the given size
> + *
> + * the sheaf might be replaced by a new one when requesting more than
> + * s->sheaf_capacity objects if such replacement is necessary, but the refill
> + * fails (with -ENOMEM), the existing sheaf is left intact
> + */
> +int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp,
> +                           struct slab_sheaf **sheafp, unsigned int size)
> +{
> +       struct slab_sheaf *sheaf;
> +
> +       /*
> +        * TODO: do we want to support *sheaf == NULL to be equivalent of
> +        * kmem_cache_prefill_sheaf() ?
> +        */
> +       if (!sheafp || !(*sheafp))
> +               return -EINVAL;
> +
> +       sheaf = *sheafp;
> +       if (sheaf->size >= size)
> +               return 0;
> +
> +       if (likely(sheaf->capacity >= size)) {
> +               if (likely(sheaf->capacity == s->sheaf_capacity))
> +                       return refill_sheaf(s, sheaf, gfp);
> +
> +               if (!__kmem_cache_alloc_bulk(s, gfp, sheaf->capacity - sheaf->size,
> +                                            &sheaf->objects[sheaf->size])) {
> +                       return -ENOMEM;
> +               }
> +               sheaf->size = sheaf->capacity;
> +
> +               return 0;
> +       }
> +
> +       /*
> +        * We had a regular sized sheaf and need an oversize one, or we had an
> +        * oversize one already but need a larger one now.
> +        * This should be a very rare path so let's not complicate it.
> +        */
> +       sheaf = kmem_cache_prefill_sheaf(s, gfp, size);

With all the above I think you always end up refilling up to
sheaf->capacity. Not sure if we should mention that in the comment for
this function because your statement about refilling to at least the
given size is still correct.

> +       if (!sheaf)
> +               return -ENOMEM;
> +
> +       kmem_cache_return_sheaf(s, gfp, *sheafp);
> +       *sheafp = sheaf;
> +       return 0;
> +}
> +
> +/*
> + * Allocate from a sheaf obtained by kmem_cache_prefill_sheaf()
> + *
> + * Guaranteed not to fail as many allocations as was the requested size.
> + * After the sheaf is emptied, it fails - no fallback to the slab cache itself.
> + *
> + * The gfp parameter is meant only to specify __GFP_ZERO or __GFP_ACCOUNT
> + * memcg charging is forced over limit if necessary, to avoid failure.
> + */
> +void *
> +kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *s, gfp_t gfp,
> +                                  struct slab_sheaf *sheaf)
> +{
> +       void *ret = NULL;
> +       bool init;
> +
> +       if (sheaf->size == 0)
> +               goto out;
> +
> +       ret = sheaf->objects[--sheaf->size];
> +
> +       init = slab_want_init_on_alloc(gfp, s);
> +
> +       /* add __GFP_NOFAIL to force successful memcg charging */
> +       slab_post_alloc_hook(s, NULL, gfp | __GFP_NOFAIL, 1, &ret, init, s->object_size);
> +out:
> +       trace_kmem_cache_alloc(_RET_IP_, ret, s, gfp, NUMA_NO_NODE);
> +
> +       return ret;
> +}
> +
> +unsigned int kmem_cache_sheaf_size(struct slab_sheaf *sheaf)
> +{
> +       return sheaf->size;
> +}
>  /*
>   * To avoid unnecessary overhead, we pass through large allocation requests
>   * directly to the page allocator. We use __GFP_COMP, because we will need to
>
> --
> 2.48.1
>
Re: [PATCH RFC v2 06/10] slab: sheaf prefilling for guaranteed allocations
Posted by Vlastimil Babka 9 months, 1 week ago
On 2/23/25 04:54, Suren Baghdasaryan wrote:
> On Fri, Feb 14, 2025 at 8:27 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>>
>> Add functions for efficient guaranteed allocations e.g. in a critical
>> section that cannot sleep, when the exact number of allocations is not
>> known beforehand, but an upper limit can be calculated.
>>
>> kmem_cache_prefill_sheaf() returns a sheaf containing at least given
>> number of objects.
>>
>> kmem_cache_alloc_from_sheaf() will allocate an object from the sheaf
>> and is guaranteed not to fail until depleted.
>>
>> kmem_cache_return_sheaf() is for giving the sheaf back to the slab
>> allocator after the critical section. This will also attempt to refill
>> it to cache's sheaf capacity for better efficiency of sheaves handling,
>> but it's not stricly necessary to succeed.
>>
>> kmem_cache_refill_sheaf() can be used to refill a previously obtained
>> sheaf to requested size. If the current size is sufficient, it does
>> nothing. If the requested size exceeds cache's sheaf_capacity and the
>> sheaf's current capacity, the sheaf will be replaced with a new one,
>> hence the indirect pointer parameter.
>>
>> kmem_cache_sheaf_size() can be used to query the current size.
>>
>> The implementation supports requesting sizes that exceed cache's
>> sheaf_capacity, but it is not efficient - such sheaves are allocated
>> fresh in kmem_cache_prefill_sheaf() and flushed and freed immediately by
>> kmem_cache_return_sheaf(). kmem_cache_refill_sheaf() might be expecially
> 
> s/expecially/especially
> 
>> ineffective when replacing a sheaf with a new one of a larger capacity.
>> It is therefore better to size cache's sheaf_capacity accordingly.
> 
> If support for sizes exceeding sheaf_capacity adds much complexity
> with no performance benefits, I think it would be ok not to support
> them at all. Users know the capacity of a particular kmem_cache, so
> they can use this API only when their needs are within sheaf_capacity,
> otherwise either size the sheaf appropriately or use slab bulk
> allocation.

As Harry explained, the users (e.g. maple tree) would have to implement the
fallback for unusual situations instead, so it's better to implement it just
once here.

>>
>> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> 
> Reviewed-by: Suren Baghdasaryan <surenb@google.com>

Thanks.

>> +/*
>> + * Use this to return a sheaf obtained by kmem_cache_prefill_sheaf()
>> + * It tries to refill the sheaf back to the cache's sheaf_capacity
>> + * to avoid handling partially full sheaves.
>> + *
>> + * If the refill fails because gfp is e.g. GFP_NOWAIT, the sheaf is
>> + * instead dissolved
> 
> Refilling the sheaf here assumes that in the future we are more likely
> to allocate than to free objects or shrink the slab. If the reverse is
> true then it would make sense to flush the sheaf and add it as an
> empty one into the barn. The fact that flushing can't fail would be
> another advantage... We don't know the future but should we be
> predicting a more costly case?

What the comment doesn't say is we first try to make the sheaf become
pcs->spare without any refill. This is the ideal scenario if nobody
interrupts us between prefill (we grab the spare) and return (we return the
spare).

Also the refill is only attempted if the barn can accept full sheaf.

I have clarified the comment.

Maybe we could make the decision to flush e.g. if the sheaf is below half of
the capacity, but that can be subject to further performance evaluation.
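
That heuristic could be as simple as the following (an untested sketch,
placed before the refill attempt in kmem_cache_return_sheaf(), reusing the
existing flush-and-free path):

	/* a mostly depleted sheaf: flushing can't fail and is likely cheaper */
	if (sheaf->size < s->sheaf_capacity / 2) {
		sheaf_flush(s, sheaf);
		free_empty_sheaf(s, sheaf);
		return;
	}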

>> + */
>> +void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp,
>> +                            struct slab_sheaf *sheaf)
>> +{
>> +       struct slub_percpu_sheaves *pcs;
>> +       bool refill = false;
>> +       struct node_barn *barn;
>> +
>> +       if (unlikely(sheaf->capacity != s->sheaf_capacity)) {
>> +               sheaf_flush(s, sheaf);
>> +               kfree(sheaf);
>> +               return;
>> +       }
>> +
>> +       localtry_lock(&s->cpu_sheaves->lock);
>> +       pcs = this_cpu_ptr(s->cpu_sheaves);
>> +
>> +       if (!pcs->spare) {
>> +               pcs->spare = sheaf;
>> +               sheaf = NULL;
>> +       } else if (pcs->barn->nr_full >= MAX_FULL_SHEAVES) {
>> +               /* racy check */
>> +               barn = pcs->barn;
>> +               refill = true;
>> +       }
>> +
>> +       localtry_unlock(&s->cpu_sheaves->lock);
>> +
>> +       if (!sheaf)
>> +               return;
>> +
>> +       /*
>> +        * if the barn is full of full sheaves or we fail to refill the sheaf,
>> +        * simply flush and free it
>> +        */
>> +       if (!refill || refill_sheaf(s, sheaf, gfp)) {
>> +               sheaf_flush(s, sheaf);
>> +               free_empty_sheaf(s, sheaf);
>> +               return;
>> +       }
>> +
>> +       /* we racily determined the sheaf would fit, so now force it */
>> +       barn_put_full_sheaf(barn, sheaf, true);
>> +}
>> +
>> +/*
>> + * refill a sheaf previously returned by kmem_cache_prefill_sheaf to at least
>> + * the given size
>> + *
>> + * the sheaf might be replaced by a new one when requesting more than
>> + * s->sheaf_capacity objects if such replacement is necessary, but the refill
>> + * fails (with -ENOMEM), the existing sheaf is left intact
>> + */
>> +int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp,
>> +                           struct slab_sheaf **sheafp, unsigned int size)
>> +{
>> +       struct slab_sheaf *sheaf;
>> +
>> +       /*
>> +        * TODO: do we want to support *sheaf == NULL to be equivalent of
>> +        * kmem_cache_prefill_sheaf() ?
>> +        */
>> +       if (!sheafp || !(*sheafp))
>> +               return -EINVAL;
>> +
>> +       sheaf = *sheafp;
>> +       if (sheaf->size >= size)
>> +               return 0;
>> +
>> +       if (likely(sheaf->capacity >= size)) {
>> +               if (likely(sheaf->capacity == s->sheaf_capacity))
>> +                       return refill_sheaf(s, sheaf, gfp);
>> +
>> +               if (!__kmem_cache_alloc_bulk(s, gfp, sheaf->capacity - sheaf->size,
>> +                                            &sheaf->objects[sheaf->size])) {
>> +                       return -ENOMEM;
>> +               }
>> +               sheaf->size = sheaf->capacity;
>> +
>> +               return 0;
>> +       }
>> +
>> +       /*
>> +        * We had a regular sized sheaf and need an oversize one, or we had an
>> +        * oversize one already but need a larger one now.
>> +        * This should be a very rare path so let's not complicate it.
>> +        */
>> +       sheaf = kmem_cache_prefill_sheaf(s, gfp, size);
> 
> WIth all the above I think you always end up refilling up to
> sheaf->capacity. Not sure if we should mention that in the comment for
> this function because your statement about refilling to at least the
> given size is still correct.

OK mentioned it in the comment.

Re: [PATCH RFC v2 06/10] slab: sheaf prefilling for guaranteed allocations
Posted by Harry Yoo 9 months, 3 weeks ago
On Sat, Feb 22, 2025 at 07:54:16PM -0800, Suren Baghdasaryan wrote:
> On Fri, Feb 14, 2025 at 8:27 AM Vlastimil Babka <vbabka@suse.cz> wrote:
> >
> > Add functions for efficient guaranteed allocations e.g. in a critical
> > section that cannot sleep, when the exact number of allocations is not
> > known beforehand, but an upper limit can be calculated.
> >
> > kmem_cache_prefill_sheaf() returns a sheaf containing at least given
> > number of objects.
> >
> > kmem_cache_alloc_from_sheaf() will allocate an object from the sheaf
> > and is guaranteed not to fail until depleted.
> >
> > kmem_cache_return_sheaf() is for giving the sheaf back to the slab
> > allocator after the critical section. This will also attempt to refill
> > it to cache's sheaf capacity for better efficiency of sheaves handling,
> > but it's not stricly necessary to succeed.
> >
> > kmem_cache_refill_sheaf() can be used to refill a previously obtained
> > sheaf to requested size. If the current size is sufficient, it does
> > nothing. If the requested size exceeds cache's sheaf_capacity and the
> > sheaf's current capacity, the sheaf will be replaced with a new one,
> > hence the indirect pointer parameter.
> >
> > kmem_cache_sheaf_size() can be used to query the current size.
> >
> > The implementation supports requesting sizes that exceed cache's
> > sheaf_capacity, but it is not efficient - such sheaves are allocated
> > fresh in kmem_cache_prefill_sheaf() and flushed and freed immediately by
> > kmem_cache_return_sheaf(). kmem_cache_refill_sheaf() might be expecially
> 
> s/expecially/especially
> 
> > ineffective when replacing a sheaf with a new one of a larger capacity.
> > It is therefore better to size cache's sheaf_capacity accordingly.
> 
> If support for sizes exceeding sheaf_capacity adds much complexity
> with no performance benefits, I think it would be ok not to support
> them at all. Users know the capacity of a particular kmem_cache, so
> they can use this API only when their needs are within sheaf_capacity,
> otherwise either size the sheaf appropriately or use slab bulk
> allocation.

At least for maple tree, I think the reason why it supports varying sizes
(that may exceed sheaf_capacity) of sheaves is that the upper limit depends
on the store operation maple tree is going to perform, and the height of a
maple tree?

Or can we set a single maximum sheaf capacity that works for any
store operation and any height of maple trees?

Liam may have an opinion on it...

> > Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> 
> Reviewed-by: Suren Baghdasaryan <surenb@google.com>

-- 
Cheers,
Harry