When a pfmemalloc allocation actually dips into reserves, the slab is
marked accordingly and non-pfmemalloc allocations should not be allowed
to allocate from it. The sheaves percpu caching currently doesn't follow
this rule, so implement it before we expand sheaves usage to all caches.

Make sure objects from pfmemalloc slabs don't end up in percpu sheaves.
When freeing an object from a pfmemalloc slab, skip the sheaves. When
refilling sheaves, use __GFP_NOMEMALLOC to override any pfmemalloc
context - the allocation will fall back to regular slab allocations when
sheaves are depleted and can't be refilled because of the override.

For kfree_rcu(), detect pfmemalloc slabs when processing the rcu_sheaf
in __rcu_free_sheaf_prepare() after the grace period, and simply flush
the sheaf if any object comes from a pfmemalloc slab.

For prefilled sheaves, try to refill them with __GFP_NOMEMALLOC first;
if that fails, retry without __GFP_NOMEMALLOC but mark the sheaf
pfmemalloc, so that it is flushed back to the slabs when returned.
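From a caller's perspective, the prefilled sheaf flow then looks roughly
like the sketch below. This is only an illustration; the exact
kmem_cache_alloc_from_sheaf() signature is assumed from the rest of the
sheaves series.

    /* sketch only - error handling trimmed */
    static int sheaf_example(struct kmem_cache *s)
    {
            struct slab_sheaf *sheaf;
            void *obj;

            /*
             * The refill is tried with __GFP_NOMEMALLOC first; only when
             * that fails in a pfmemalloc-allowed context (e.g. reclaim)
             * is the sheaf refilled from reserves and marked pfmemalloc.
             */
            sheaf = kmem_cache_prefill_sheaf(s, GFP_KERNEL, 8);
            if (!sheaf)
                    return -ENOMEM;

            obj = kmem_cache_alloc_from_sheaf(s, GFP_KERNEL, sheaf);
            if (obj)
                    kmem_cache_free(s, obj);

            /*
             * A sheaf marked pfmemalloc is flushed back to the slabs here
             * instead of being kept as pcs->spare, so reserve objects
             * don't become visible to non-pfmemalloc allocations.
             */
            kmem_cache_return_sheaf(s, GFP_KERNEL, sheaf);
            return 0;
    }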
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
mm/slub.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++--------------
1 file changed, 51 insertions(+), 14 deletions(-)
diff --git a/mm/slub.c b/mm/slub.c
index 4731b9e461c2..ab03f29dc3bf 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -469,7 +469,10 @@ struct slab_sheaf {
struct rcu_head rcu_head;
struct list_head barn_list;
/* only used for prefilled sheafs */
- unsigned int capacity;
+ struct {
+ unsigned int capacity;
+ bool pfmemalloc;
+ };
};
struct kmem_cache *cache;
unsigned int size;
@@ -2645,7 +2648,7 @@ static struct slab_sheaf *alloc_full_sheaf(struct kmem_cache *s, gfp_t gfp)
if (!sheaf)
return NULL;
- if (refill_sheaf(s, sheaf, gfp)) {
+ if (refill_sheaf(s, sheaf, gfp | __GFP_NOMEMALLOC)) {
free_empty_sheaf(s, sheaf);
return NULL;
}
@@ -2723,12 +2726,13 @@ static void sheaf_flush_unused(struct kmem_cache *s, struct slab_sheaf *sheaf)
sheaf->size = 0;
}
-static void __rcu_free_sheaf_prepare(struct kmem_cache *s,
+static bool __rcu_free_sheaf_prepare(struct kmem_cache *s,
struct slab_sheaf *sheaf)
{
bool init = slab_want_init_on_free(s);
void **p = &sheaf->objects[0];
unsigned int i = 0;
+ bool pfmemalloc = false;
while (i < sheaf->size) {
struct slab *slab = virt_to_slab(p[i]);
@@ -2741,8 +2745,13 @@ static void __rcu_free_sheaf_prepare(struct kmem_cache *s,
continue;
}
+ if (slab_test_pfmemalloc(slab))
+ pfmemalloc = true;
+
i++;
}
+
+ return pfmemalloc;
}
static void rcu_free_sheaf_nobarn(struct rcu_head *head)
@@ -5031,7 +5040,7 @@ __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs,
return NULL;
if (empty) {
- if (!refill_sheaf(s, empty, gfp)) {
+ if (!refill_sheaf(s, empty, gfp | __GFP_NOMEMALLOC)) {
full = empty;
} else {
/*
@@ -5331,6 +5340,26 @@ void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t gfpflags, int nod
}
EXPORT_SYMBOL(kmem_cache_alloc_node_noprof);
+static int __prefill_sheaf_pfmemalloc(struct kmem_cache *s,
+ struct slab_sheaf *sheaf, gfp_t gfp)
+{
+ int ret = 0;
+
+ ret = refill_sheaf(s, sheaf, gfp | __GFP_NOMEMALLOC);
+
+ if (likely(!ret || !gfp_pfmemalloc_allowed(gfp)))
+ return ret;
+
+ /*
+ * if we are allowed to, refill sheaf with pfmemalloc but then remember
+ * it for when it's returned
+ */
+ ret = refill_sheaf(s, sheaf, gfp);
+ sheaf->pfmemalloc = true;
+
+ return ret;
+}
+
/*
* returns a sheaf that has at least the requested size
* when prefilling is needed, do so with given gfp flags
@@ -5401,17 +5430,18 @@ kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size)
if (!sheaf)
sheaf = alloc_empty_sheaf(s, gfp);
- if (sheaf && sheaf->size < size) {
- if (refill_sheaf(s, sheaf, gfp)) {
+ if (sheaf) {
+ sheaf->capacity = s->sheaf_capacity;
+ sheaf->pfmemalloc = false;
+
+ if (sheaf->size < size &&
+ __prefill_sheaf_pfmemalloc(s, sheaf, gfp)) {
sheaf_flush_unused(s, sheaf);
free_empty_sheaf(s, sheaf);
sheaf = NULL;
}
}
- if (sheaf)
- sheaf->capacity = s->sheaf_capacity;
-
return sheaf;
}
@@ -5431,7 +5461,8 @@ void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp,
struct slub_percpu_sheaves *pcs;
struct node_barn *barn;
- if (unlikely(sheaf->capacity != s->sheaf_capacity)) {
+ if (unlikely((sheaf->capacity != s->sheaf_capacity)
+ || sheaf->pfmemalloc)) {
sheaf_flush_unused(s, sheaf);
kfree(sheaf);
return;
@@ -5497,7 +5528,7 @@ int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp,
if (likely(sheaf->capacity >= size)) {
if (likely(sheaf->capacity == s->sheaf_capacity))
- return refill_sheaf(s, sheaf, gfp);
+ return __prefill_sheaf_pfmemalloc(s, sheaf, gfp);
if (!__kmem_cache_alloc_bulk(s, gfp, sheaf->capacity - sheaf->size,
&sheaf->objects[sheaf->size])) {
@@ -6177,8 +6208,12 @@ static void rcu_free_sheaf(struct rcu_head *head)
* handles it fine. The only downside is that sheaf will serve fewer
* allocations when reused. It only happens due to debugging, which is a
* performance hit anyway.
+ *
+ * If it returns true, there was at least one object from a pfmemalloc
+ * slab, so simply flush everything.
*/
- __rcu_free_sheaf_prepare(s, sheaf);
+ if (__rcu_free_sheaf_prepare(s, sheaf))
+ goto flush;
n = get_node(s, sheaf->node);
if (!n)
@@ -6333,7 +6368,8 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
continue;
}
- if (unlikely(IS_ENABLED(CONFIG_NUMA) && slab_nid(slab) != node)) {
+ if (unlikely((IS_ENABLED(CONFIG_NUMA) && slab_nid(slab) != node)
+ || slab_test_pfmemalloc(slab))) {
remote_objects[remote_nr] = p[i];
p[i] = p[--size];
if (++remote_nr >= PCS_BATCH_MAX)
@@ -6631,7 +6667,8 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object,
return;
if (s->cpu_sheaves && likely(!IS_ENABLED(CONFIG_NUMA) ||
- slab_nid(slab) == numa_mem_id())) {
+ slab_nid(slab) == numa_mem_id())
+ && likely(!slab_test_pfmemalloc(slab))) {
if (likely(free_to_pcs(s, object)))
return;
}
--
2.51.1
On Thu, 23 Oct 2025 15:52:24 +0200 Vlastimil Babka <vbabka@suse.cz> wrote:
> When a pfmemalloc allocation actually dips into reserves, the slab is
> marked accordingly and non-pfmemalloc allocations should not be allowed
> to allocate from it. The sheaves percpu caching currently doesn't follow
> this rule, so implement it before we expand sheaves usage to all caches.
>
> Make sure objects from pfmemalloc slabs don't end up in percpu sheaves.
> When freeing, skip sheaves when freeing an object from pfmemalloc slab.
> When refilling sheaves, use __GFP_NOMEMALLOC to override any pfmemalloc
> context - the allocation will fallback to regular slab allocations when
> sheaves are depleted and can't be refilled because of the override.
>
> For kfree_rcu(), detect pfmemalloc slabs after processing the rcu_sheaf
> after the grace period in __rcu_free_sheaf_prepare() and simply flush
> it if any object is from pfmemalloc slabs.
>
> For prefilled sheaves, try to refill them first with __GFP_NOMEMALLOC
> and if it fails, retry without __GFP_NOMEMALLOC but then mark the sheaf
> pfmemalloc, which makes it flushed back to slabs when returned.
>
Hi Vlastimil,

We're trying out the AI patch review automation on the BPF commits and
it had some questions about a few of these. Since the goal is to
actually test the automation, I'm putting them in unedited, but I did
try to make sure they were valid before sending.

For this one, I'm not sure whether the behavior the AI is flagging is
intentional, so it feels like a reasonable question to me at least:
> diff --git a/mm/slub.c b/mm/slub.c
> index 4731b9e461c2..ab03f29dc3bf 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
[ ... ]
> @@ -5331,6 +5340,26 @@ void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t gfpflags, int nod
> }
> EXPORT_SYMBOL(kmem_cache_alloc_node_noprof);
>
> +static int __prefill_sheaf_pfmemalloc(struct kmem_cache *s,
> + struct slab_sheaf *sheaf, gfp_t gfp)
> +{
> + int ret = 0;
> +
> + ret = refill_sheaf(s, sheaf, gfp | __GFP_NOMEMALLOC);
> +
> + if (likely(!ret || !gfp_pfmemalloc_allowed(gfp)))
> + return ret;
> +
> + /*
> + * if we are allowed to, refill sheaf with pfmemalloc but then remember
> + * it for when it's returned
> + */
> + ret = refill_sheaf(s, sheaf, gfp);
> + sheaf->pfmemalloc = true;
> +
> + return ret;
> +}
[ ... ]
> @@ -5497,7 +5528,7 @@ int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp,
>
> if (likely(sheaf->capacity >= size)) {
> if (likely(sheaf->capacity == s->sheaf_capacity))
> - return refill_sheaf(s, sheaf, gfp);
> + return __prefill_sheaf_pfmemalloc(s, sheaf, gfp);
>
> if (!__kmem_cache_alloc_bulk(s, gfp, sheaf->capacity - sheaf->size,
> &sheaf->objects[sheaf->size])) {
^^^
In kmem_cache_refill_sheaf(), does the oversize sheaf path (when
sheaf->capacity != s->sheaf_capacity) need __GFP_NOMEMALLOC too?
The commit message says "When refilling sheaves, use __GFP_NOMEMALLOC
to override any pfmemalloc context", and the normal capacity path now
calls __prefill_sheaf_pfmemalloc() which adds __GFP_NOMEMALLOC.
But this oversize path still calls __kmem_cache_alloc_bulk() with the
raw gfp flags. If the calling context is pfmemalloc-enabled (e.g.,
during swap or network operations), could pfmemalloc objects be
allocated into the sheaf? Those objects would then be returned via
kmem_cache_alloc_from_sheaf() to potentially non-pfmemalloc callers.
Similar issue exists in kmem_cache_prefill_sheaf() for the oversize
path at the function's start, though that code wasn't modified by this
patch.
> return -ENOMEM;
> }
> sheaf->size = sheaf->capacity;
[ ... ]
On 10/24/25 16:21, Chris Mason wrote:
> On Thu, 23 Oct 2025 15:52:24 +0200 Vlastimil Babka <vbabka@suse.cz> wrote:
>> @@ -5497,7 +5528,7 @@ int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp,
>>
>> if (likely(sheaf->capacity >= size)) {
>> if (likely(sheaf->capacity == s->sheaf_capacity))
>> - return refill_sheaf(s, sheaf, gfp);
>> + return __prefill_sheaf_pfmemalloc(s, sheaf, gfp);
>>
>> if (!__kmem_cache_alloc_bulk(s, gfp, sheaf->capacity - sheaf->size,
>> &sheaf->objects[sheaf->size])) {
> ^^^
>
> In kmem_cache_refill_sheaf(), does the oversize sheaf path (when
> sheaf->capacity != s->sheaf_capacity) need __GFP_NOMEMALLOC too?
>
> The commit message says "When refilling sheaves, use __GFP_NOMEMALLOC
> to override any pfmemalloc context", and the normal capacity path now
> calls __prefill_sheaf_pfmemalloc() which adds __GFP_NOMEMALLOC.
>
> But this oversize path still calls __kmem_cache_alloc_bulk() with the
> raw gfp flags. If the calling context is pfmemalloc-enabled (e.g.,
> during swap or network operations), could pfmemalloc objects be
> allocated into the sheaf? Those objects would then be returned via
Yes.
> kmem_cache_alloc_from_sheaf() to potentially non-pfmemalloc callers.
The assumption is the caller will use the prefilled sheaf for its purposes
and not pass it to other callers. The reason for caring about pfmemalloc and
setting sheaf->pfmemalloc is only to recognize them when the prefilled sheaf
is returned - so that it's flushed+freed and not attached as pcs->spare -
that would then be available to other non-pfmemalloc callers.
But we always flush oversize sheaves when those are returned, so it's not
necessary to also track pfmemalloc for them. I'll add a comment about it.
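Roughly along these lines in kmem_cache_refill_sheaf()'s oversize path -
just to illustrate the point based on the code already quoted above, not
the final wording:

	if (likely(sheaf->capacity >= size)) {
		if (likely(sheaf->capacity == s->sheaf_capacity))
			return __prefill_sheaf_pfmemalloc(s, sheaf, gfp);

		/*
		 * Oversize sheaves are always flushed and freed when
		 * returned, never attached as pcs->spare, so a plain-gfp
		 * bulk refill here cannot leak pfmemalloc objects to
		 * other, non-pfmemalloc callers.
		 */
		if (!__kmem_cache_alloc_bulk(s, gfp, sheaf->capacity - sheaf->size,
					     &sheaf->objects[sheaf->size])) {
			return -ENOMEM;
		}
		sheaf->size = sheaf->capacity;
	}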
Thanks,
Vlastimil
> Similar issue exists in kmem_cache_prefill_sheaf() for the oversize
> path at the function's start, though that code wasn't modified by this
> patch.
>
>> return -ENOMEM;
>> }
>> sheaf->size = sheaf->capacity;
>
> [ ... ]
>
>
On 10/29/25 11:00 AM, Vlastimil Babka wrote:
> On 10/24/25 16:21, Chris Mason wrote:
>> On Thu, 23 Oct 2025 15:52:24 +0200 Vlastimil Babka <vbabka@suse.cz> wrote:
>>> @@ -5497,7 +5528,7 @@ int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp,
>>>
>>> if (likely(sheaf->capacity >= size)) {
>>> if (likely(sheaf->capacity == s->sheaf_capacity))
>>> - return refill_sheaf(s, sheaf, gfp);
>>> + return __prefill_sheaf_pfmemalloc(s, sheaf, gfp);
>>>
>>> if (!__kmem_cache_alloc_bulk(s, gfp, sheaf->capacity - sheaf->size,
>>> &sheaf->objects[sheaf->size])) {
>> ^^^
>>
>> In kmem_cache_refill_sheaf(), does the oversize sheaf path (when
>> sheaf->capacity != s->sheaf_capacity) need __GFP_NOMEMALLOC too?
>>
>> The commit message says "When refilling sheaves, use __GFP_NOMEMALLOC
>> to override any pfmemalloc context", and the normal capacity path now
>> calls __prefill_sheaf_pfmemalloc() which adds __GFP_NOMEMALLOC.
>>
>> But this oversize path still calls __kmem_cache_alloc_bulk() with the
>> raw gfp flags. If the calling context is pfmemalloc-enabled (e.g.,
>> during swap or network operations), could pfmemalloc objects be
>> allocated into the sheaf? Those objects would then be returned via
>
> Yes.
>
>> kmem_cache_alloc_from_sheaf() to potentially non-pfmemalloc callers.
>
> The assumption is the caller will use the prefilled sheaf for its purposes
> and not pass it to other callers. The reason for caring about pfmemalloc and
> setting sheaf->pfmemalloc is only to recognize them when the prefilled sheaf
> is returned - so that it's flushed+freed and not attached as pcs->spare -
> that would then be available to other non-pfmemalloc callers.
>
> But we always flush oversize sheaves when those are returned, so it's not
> necessary to also track pfmemalloc for them. I'll add a comment about it.
Oh I see, this makes sense. Thanks!
-chris