[PATCH v3 17/21] slab: refill sheaves from all nodes

Posted by Vlastimil Babka 3 weeks, 3 days ago
__refill_objects() currently only attempts to get partial slabs from the
local node and then allocates new slab(s). Expand it to also try other
nodes while observing the remote node defrag ratio, similarly to
get_any_partial().

This will prevent allocating new slabs on a node while other nodes have
many free slabs. It does mean sheaves will contain non-local objects in
that case. Allocations that care about a specific node will still be
served appropriately, but might get a slowpath allocation.

Like get_any_partial(), we do observe cpuset_zone_allowed(), although we
might be refilling a sheaf that will then be used from a different
allocation context.

We can also use the resulting refill_objects() in
__kmem_cache_alloc_bulk() for non-debug caches. This means
kmem_cache_alloc_bulk() will get better performance when sheaves are
exhausted. kmem_cache_alloc_bulk() cannot indicate a preferred node, so
it is compatible with the sheaf refill preferring the local node.
Its users also pass gfp flags that allow spinning, so document that
as a requirement.
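
For illustration only (not part of the patch), a minimal bulk caller could
look like the following, where "cache" is a hypothetical kmem_cache;
GFP_KERNEL allows spinning, so it satisfies the documented requirement:

	void *objs[16];
	int n;

	/* returns the number of allocated objects, 0 on failure */
	n = kmem_cache_alloc_bulk(cache, GFP_KERNEL, ARRAY_SIZE(objs), objs);
	if (!n)
		return -ENOMEM;

	/* ... use the objects ... */

	kmem_cache_free_bulk(cache, ARRAY_SIZE(objs), objs);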

Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 mm/slub.c | 137 ++++++++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 106 insertions(+), 31 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
index d52de6e3c2d5..2c522d2bf547 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2518,8 +2518,8 @@ static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf)
 }
 
 static unsigned int
-__refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
-		 unsigned int max);
+refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
+	       unsigned int max);
 
 static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf,
 			 gfp_t gfp)
@@ -2530,8 +2530,8 @@ static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf,
 	if (!to_fill)
 		return 0;
 
-	filled = __refill_objects(s, &sheaf->objects[sheaf->size], gfp,
-			to_fill, to_fill);
+	filled = refill_objects(s, &sheaf->objects[sheaf->size], gfp, to_fill,
+				to_fill);
 
 	sheaf->size += filled;
 
@@ -6522,29 +6522,22 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
 EXPORT_SYMBOL(kmem_cache_free_bulk);
 
 static unsigned int
-__refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
-		 unsigned int max)
+__refill_objects_node(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
+		      unsigned int max, struct kmem_cache_node *n)
 {
 	struct slab *slab, *slab2;
 	struct partial_context pc;
 	unsigned int refilled = 0;
 	unsigned long flags;
 	void *object;
-	int node;
 
 	pc.flags = gfp;
 	pc.min_objects = min;
 	pc.max_objects = max;
 
-	node = numa_mem_id();
-
-	if (WARN_ON_ONCE(!gfpflags_allow_spinning(gfp)))
+	if (!get_partial_node_bulk(s, n, &pc))
 		return 0;
 
-	/* TODO: consider also other nodes? */
-	if (!get_partial_node_bulk(s, get_node(s, node), &pc))
-		goto new_slab;
-
 	list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
 
 		list_del(&slab->slab_list);
@@ -6582,8 +6575,6 @@ __refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
 	}
 
 	if (unlikely(!list_empty(&pc.slabs))) {
-		struct kmem_cache_node *n = get_node(s, node);
-
 		spin_lock_irqsave(&n->list_lock, flags);
 
 		list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
@@ -6605,13 +6596,92 @@ __refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
 		}
 	}
 
+	return refilled;
+}
 
-	if (likely(refilled >= min))
-		goto out;
+#ifdef CONFIG_NUMA
+static unsigned int
+__refill_objects_any(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
+		     unsigned int max, int local_node)
+{
+	struct zonelist *zonelist;
+	struct zoneref *z;
+	struct zone *zone;
+	enum zone_type highest_zoneidx = gfp_zone(gfp);
+	unsigned int cpuset_mems_cookie;
+	unsigned int refilled = 0;
+
+	/* see get_any_partial() for the defrag ratio description */
+	if (!s->remote_node_defrag_ratio ||
+			get_cycles() % 1024 > s->remote_node_defrag_ratio)
+		return 0;
+
+	do {
+		cpuset_mems_cookie = read_mems_allowed_begin();
+		zonelist = node_zonelist(mempolicy_slab_node(), gfp);
+		for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
+			struct kmem_cache_node *n;
+			unsigned int r;
+
+			n = get_node(s, zone_to_nid(zone));
+
+			if (!n || !cpuset_zone_allowed(zone, gfp) ||
+					n->nr_partial <= s->min_partial)
+				continue;
+
+			r = __refill_objects_node(s, p, gfp, min, max, n);
+			refilled += r;
+
+			if (r >= min) {
+				/*
+				 * Don't check read_mems_allowed_retry() here -
+				 * if mems_allowed was updated in parallel, that
+				 * was a harmless race between allocation and
+				 * the cpuset update
+				 */
+				return refilled;
+			}
+			p += r;
+			min -= r;
+			max -= r;
+		}
+	} while (read_mems_allowed_retry(cpuset_mems_cookie));
+
+	return refilled;
+}
+#else
+static inline unsigned int
+__refill_objects_any(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
+		     unsigned int max, int local_node)
+{
+	return 0;
+}
+#endif
+
+static unsigned int
+refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
+	       unsigned int max)
+{
+	int local_node = numa_mem_id();
+	unsigned int refilled;
+	struct slab *slab;
+
+	if (WARN_ON_ONCE(!gfpflags_allow_spinning(gfp)))
+		return 0;
+
+	refilled = __refill_objects_node(s, p, gfp, min, max,
+					 get_node(s, local_node));
+	if (refilled >= min)
+		return refilled;
+
+	refilled += __refill_objects_any(s, p + refilled, gfp, min - refilled,
+					 max - refilled, local_node);
+	if (refilled >= min)
+		return refilled;
 
 new_slab:
 
-	slab = new_slab(s, pc.flags, node);
+	slab = new_slab(s, gfp, local_node);
 	if (!slab)
 		goto out;
 
@@ -6626,8 +6696,8 @@ __refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
 
 	if (refilled < min)
 		goto new_slab;
-out:
 
+out:
 	return refilled;
 }
 
@@ -6637,18 +6707,20 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 {
 	int i;
 
-	/*
-	 * TODO: this might be more efficient (if necessary) by reusing
-	 * __refill_objects()
-	 */
-	for (i = 0; i < size; i++) {
+	if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
+		for (i = 0; i < size; i++) {
 
-		p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_,
-				     s->object_size);
-		if (unlikely(!p[i]))
-			goto error;
+			p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_,
+					     s->object_size);
+			if (unlikely(!p[i]))
+				goto error;
 
-		maybe_wipe_obj_freeptr(s, p[i]);
+			maybe_wipe_obj_freeptr(s, p[i]);
+		}
+	} else {
+		i = refill_objects(s, p, flags, size, size);
+		if (i < size)
+			goto error;
 	}
 
 	return i;
@@ -6659,7 +6731,10 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 
 }
 
-/* Note that interrupts must be enabled when calling this function. */
+/*
+ * Note that interrupts must be enabled when calling this function and gfp
+ * flags must allow spinning.
+ */
 int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size,
 				 void **p)
 {

-- 
2.52.0
Re: [PATCH v3 17/21] slab: refill sheaves from all nodes
Posted by Harry Yoo 2 weeks, 5 days ago
On Fri, Jan 16, 2026 at 03:40:37PM +0100, Vlastimil Babka wrote:
> __refill_objects() currently only attempts to get partial slabs from the
> local node and then allocates new slab(s). Expand it to trying also
> other nodes while observing the remote node defrag ratio, similarly to
> get_any_partial().
> 
> This will prevent allocating new slabs on a node while other nodes have
> many free slabs. It does mean sheaves will contain non-local objects in
> that case. Allocations that care about specific node will still be
> served appropriately, but might get a slowpath allocation.

Hmm one more question.

Given that frees to remote nodes bypass the sheaves layer anyway, isn't it
more reasonable to let refill_objects() fail sometimes (based on
defrag_ratio) and fall back to the slowpath, instead of allocating new
local slabs?

> Like get_any_partial() we do observe cpuset_zone_allowed(), although we
> might be refilling a sheaf that will be then used from a different
> allocation context.
> 
> We can also use the resulting refill_objects() in
> __kmem_cache_alloc_bulk() for non-debug caches. This means
> kmem_cache_alloc_bulk() will get better performance when sheaves are
> exhausted. kmem_cache_alloc_bulk() cannot indicate a preferred node so
> it's compatible with sheaves refill in preferring the local node.
> Its users also have gfp flags that allow spinning, so document that
> as a requirement.
> 
> Reviewed-by: Suren Baghdasaryan <surenb@google.com>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> ---


-- 
Cheers,
Harry / Hyeonggon
Re: [PATCH v3 17/21] slab: refill sheaves from all nodes
Posted by Vlastimil Babka 2 weeks, 5 days ago
On 1/22/26 08:02, Harry Yoo wrote:
> On Fri, Jan 16, 2026 at 03:40:37PM +0100, Vlastimil Babka wrote:
>> __refill_objects() currently only attempts to get partial slabs from the
>> local node and then allocates new slab(s). Expand it to trying also
>> other nodes while observing the remote node defrag ratio, similarly to
>> get_any_partial().
>> 
>> This will prevent allocating new slabs on a node while other nodes have
>> many free slabs. It does mean sheaves will contain non-local objects in
>> that case. Allocations that care about specific node will still be
>> served appropriately, but might get a slowpath allocation.
> 
> Hmm one more question.
> 
> Given frees to remote nodes bypass sheaves layer anyway, isn't it
> more reasonable to let refill_objects() fail sometimes instead of
> allocating new local slabs and fall back to slowpath (based on defrag_ratio)?

You mean if we can't refill from the local partial list, we give up and
perhaps fail alloc_from_pcs()? Then the __slab_alloc_node() fallback would
either allocate a local slab or try remote nodes?

Wouldn't that mean __slab_alloc_node() does all that work for a single
object, and slow everything down? Only in the case of a new slab would it
somewhat amortize, because the next attempt would refill from it.

>> Like get_any_partial() we do observe cpuset_zone_allowed(), although we
>> might be refilling a sheaf that will be then used from a different
>> allocation context.
>> 
>> We can also use the resulting refill_objects() in
>> __kmem_cache_alloc_bulk() for non-debug caches. This means
>> kmem_cache_alloc_bulk() will get better performance when sheaves are
>> exhausted. kmem_cache_alloc_bulk() cannot indicate a preferred node so
>> it's compatible with sheaves refill in preferring the local node.
>> Its users also have gfp flags that allow spinning, so document that
>> as a requirement.
>> 
>> Reviewed-by: Suren Baghdasaryan <surenb@google.com>
>> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
>> ---
> 
>
Re: [PATCH v3 17/21] slab: refill sheaves from all nodes
Posted by Hao Li 2 weeks, 5 days ago
On Fri, Jan 16, 2026 at 03:40:37PM +0100, Vlastimil Babka wrote:
> __refill_objects() currently only attempts to get partial slabs from the
> local node and then allocates new slab(s). Expand it to trying also
> other nodes while observing the remote node defrag ratio, similarly to
> get_any_partial().
> 
> This will prevent allocating new slabs on a node while other nodes have
> many free slabs. It does mean sheaves will contain non-local objects in
> that case. Allocations that care about specific node will still be
> served appropriately, but might get a slowpath allocation.
> 
> Like get_any_partial() we do observe cpuset_zone_allowed(), although we
> might be refilling a sheaf that will be then used from a different
> allocation context.
> 
> We can also use the resulting refill_objects() in
> __kmem_cache_alloc_bulk() for non-debug caches. This means
> kmem_cache_alloc_bulk() will get better performance when sheaves are
> exhausted. kmem_cache_alloc_bulk() cannot indicate a preferred node so
> it's compatible with sheaves refill in preferring the local node.
> Its users also have gfp flags that allow spinning, so document that
> as a requirement.
> 
> Reviewed-by: Suren Baghdasaryan <surenb@google.com>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> ---
>  mm/slub.c | 137 ++++++++++++++++++++++++++++++++++++++++++++++++--------------
>  1 file changed, 106 insertions(+), 31 deletions(-)
> 
> diff --git a/mm/slub.c b/mm/slub.c
> index d52de6e3c2d5..2c522d2bf547 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -2518,8 +2518,8 @@ static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf)
>  }
>  
>  static unsigned int
> -__refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
> -		 unsigned int max);
> +refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
> +	       unsigned int max);
>  
>  static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf,
>  			 gfp_t gfp)
> @@ -2530,8 +2530,8 @@ static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf,
>  	if (!to_fill)
>  		return 0;
>  
> -	filled = __refill_objects(s, &sheaf->objects[sheaf->size], gfp,
> -			to_fill, to_fill);
> +	filled = refill_objects(s, &sheaf->objects[sheaf->size], gfp, to_fill,
> +				to_fill);
>  
>  	sheaf->size += filled;
>  
> @@ -6522,29 +6522,22 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
>  EXPORT_SYMBOL(kmem_cache_free_bulk);
>  
>  static unsigned int
> -__refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
> -		 unsigned int max)
> +__refill_objects_node(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
> +		      unsigned int max, struct kmem_cache_node *n)
>  {
>  	struct slab *slab, *slab2;
>  	struct partial_context pc;
>  	unsigned int refilled = 0;
>  	unsigned long flags;
>  	void *object;
> -	int node;
>  
>  	pc.flags = gfp;
>  	pc.min_objects = min;
>  	pc.max_objects = max;
>  
> -	node = numa_mem_id();
> -
> -	if (WARN_ON_ONCE(!gfpflags_allow_spinning(gfp)))
> +	if (!get_partial_node_bulk(s, n, &pc))
>  		return 0;
>  
> -	/* TODO: consider also other nodes? */
> -	if (!get_partial_node_bulk(s, get_node(s, node), &pc))
> -		goto new_slab;
> -
>  	list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
>  
>  		list_del(&slab->slab_list);
> @@ -6582,8 +6575,6 @@ __refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
>  	}
>  
>  	if (unlikely(!list_empty(&pc.slabs))) {
> -		struct kmem_cache_node *n = get_node(s, node);
> -
>  		spin_lock_irqsave(&n->list_lock, flags);
>  
>  		list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
> @@ -6605,13 +6596,92 @@ __refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
>  		}
>  	}
>  
> +	return refilled;
> +}
>  
> -	if (likely(refilled >= min))
> -		goto out;
> +#ifdef CONFIG_NUMA
> +static unsigned int
> +__refill_objects_any(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
> +		     unsigned int max, int local_node)


Just a small note: I noticed that the local_node variable is unused. It seems
the intention was to skip local_node in __refill_objects_any(), since it had
already been attempted in __refill_objects_node().
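
Concretely, such a skip would presumably just be one extra check in the
zonelist walk, something like (sketch only):

			/* skip the node the caller has already tried */
			if (zone_to_nid(zone) == local_node)
				continue;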

Everything else looks good.

Reviewed-by: Hao Li <hao.li@linux.dev>

Re: [PATCH v3 17/21] slab: refill sheaves from all nodes
Posted by Vlastimil Babka 2 weeks, 5 days ago
On 1/22/26 05:58, Hao Li wrote:
> Just a small note: I noticed that the local_node variable is unused. It seems
> the intention was to skip local_node in __refill_objects_any(), since it had
> already been attempted in __refill_objects_node().

Ah, I'll remove it. Such a skip likely wouldn't do much.

> Everything else looks good.
> 
> Reviewed-by: Hao Li <hao.li@linux.dev>

Thanks!
Re: [PATCH v3 17/21] slab: refill sheaves from all nodes
Posted by Harry Yoo 2 weeks, 5 days ago
On Fri, Jan 16, 2026 at 03:40:37PM +0100, Vlastimil Babka wrote:
> __refill_objects() currently only attempts to get partial slabs from the
> local node and then allocates new slab(s). Expand it to trying also
> other nodes while observing the remote node defrag ratio, similarly to
> get_any_partial().
> 
> This will prevent allocating new slabs on a node while other nodes have
> many free slabs. It does mean sheaves will contain non-local objects in
> that case. Allocations that care about specific node will still be
> served appropriately, but might get a slowpath allocation.
> 
> Like get_any_partial() we do observe cpuset_zone_allowed(), although we
> might be refilling a sheaf that will be then used from a different
> allocation context.
> 
> We can also use the resulting refill_objects() in
> __kmem_cache_alloc_bulk() for non-debug caches. This means
> kmem_cache_alloc_bulk() will get better performance when sheaves are
> exhausted. kmem_cache_alloc_bulk() cannot indicate a preferred node so
> it's compatible with sheaves refill in preferring the local node.
> Its users also have gfp flags that allow spinning, so document that
> as a requirement.
> 
> Reviewed-by: Suren Baghdasaryan <surenb@google.com>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> ---

Could this cause strict_numa to not work as intended when
the policy is MPOL_BIND?

alloc_from_pcs() has:
> #ifdef CONFIG_NUMA
>         if (static_branch_unlikely(&strict_numa) &&
>                          node == NUMA_NO_NODE) {
>
>                 struct mempolicy *mpol = current->mempolicy;
>
>                 if (mpol) {
>                         /*
>                          * Special BIND rule support. If the local node
>                          * is in permitted set then do not redirect
>                          * to a particular node.
>                          * Otherwise we apply the memory policy to get
>                          * the node we need to allocate on.
>                          */
>                         if (mpol->mode != MPOL_BIND ||
>                                         !node_isset(numa_mem_id(), mpol->nodes))

This assumes the sheaves contain (mostly, although it was never strictly
guaranteed) objects from the local node, and this change breaks that
assumption.

So... perhaps remove "Special BIND rule support"?

>
>                                 node = mempolicy_slab_node(); 
>                 }
>         }
> #endif

Otherwise LGTM.

-- 
Cheers,
Harry / Hyeonggon
Re: [PATCH v3 17/21] slab: refill sheaves from all nodes
Posted by Vlastimil Babka 2 weeks, 5 days ago
On 1/22/26 05:44, Harry Yoo wrote:
> On Fri, Jan 16, 2026 at 03:40:37PM +0100, Vlastimil Babka wrote:
>> __refill_objects() currently only attempts to get partial slabs from the
>> local node and then allocates new slab(s). Expand it to trying also
>> other nodes while observing the remote node defrag ratio, similarly to
>> get_any_partial().
>> 
>> This will prevent allocating new slabs on a node while other nodes have
>> many free slabs. It does mean sheaves will contain non-local objects in
>> that case. Allocations that care about specific node will still be
>> served appropriately, but might get a slowpath allocation.
>> 
>> Like get_any_partial() we do observe cpuset_zone_allowed(), although we
>> might be refilling a sheaf that will be then used from a different
>> allocation context.
>> 
>> We can also use the resulting refill_objects() in
>> __kmem_cache_alloc_bulk() for non-debug caches. This means
>> kmem_cache_alloc_bulk() will get better performance when sheaves are
>> exhausted. kmem_cache_alloc_bulk() cannot indicate a preferred node so
>> it's compatible with sheaves refill in preferring the local node.
>> Its users also have gfp flags that allow spinning, so document that
>> as a requirement.
>> 
>> Reviewed-by: Suren Baghdasaryan <surenb@google.com>
>> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
>> ---
> 
> Could this cause strict_numa to not work as intended when
> the policy is MPOL_BIND?

Hm, I guess it could be optimized differently later. I assume people running
strict_numa would also tune remote_node_defrag_ratio accordingly and wouldn't
run into this often.

> alloc_from_pcs() has:
>> #ifdef CONFIG_NUMA
>>         if (static_branch_unlikely(&strict_numa) &&
>>                          node == NUMA_NO_NODE) {
>>
>>                 struct mempolicy *mpol = current->mempolicy;
>>
>>                 if (mpol) {
>>                         /*
>>                          * Special BIND rule support. If the local node
>>                          * is in permitted set then do not redirect
>>                          * to a particular node.
>>                          * Otherwise we apply the memory policy to get
>>                          * the node we need to allocate on.
>>                          */
>>                         if (mpol->mode != MPOL_BIND ||
>>                                         !node_isset(numa_mem_id(), mpol->nodes))
> 
> This assumes the sheaves contain (mostly, although it wasn't strictly
> guaranteed) objects from local node, and this change breaks that
> assumption.
> 
> So... perhaps remove "Special BIND rule support"?

Ideally we would check whether the object in the sheaf is from the permitted
nodes instead of picking the local one, in a way that doesn't make systems
with strict_numa disabled slower :)
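
Roughly something like this in the strict_numa branch of alloc_from_pcs()
(just a sketch, the percpu sheaves field names are written from memory and
it assumes the main sheaf is non-empty):

	object = pcs->main->objects[pcs->main->size - 1];
	/* would we hand out an object from a node outside the bind mask? */
	if (!node_isset(page_to_nid(virt_to_page(object)), mpol->nodes))
		node = mempolicy_slab_node();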

>>
>>                                 node = mempolicy_slab_node(); 
>>                 }
>>         }
>> #endif
> 
> Otherwise LGTM.
>
Re: [PATCH v3 17/21] slab: refill sheaves from all nodes
Posted by Suren Baghdasaryan 2 weeks, 5 days ago
On Fri, Jan 16, 2026 at 2:41 PM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> __refill_objects() currently only attempts to get partial slabs from the
> local node and then allocates new slab(s). Expand it to trying also
> other nodes while observing the remote node defrag ratio, similarly to
> get_any_partial().
>
> This will prevent allocating new slabs on a node while other nodes have
> many free slabs. It does mean sheaves will contain non-local objects in
> that case. Allocations that care about specific node will still be
> served appropriately, but might get a slowpath allocation.
>
> Like get_any_partial() we do observe cpuset_zone_allowed(), although we
> might be refilling a sheaf that will be then used from a different
> allocation context.
>
> We can also use the resulting refill_objects() in
> __kmem_cache_alloc_bulk() for non-debug caches. This means
> kmem_cache_alloc_bulk() will get better performance when sheaves are
> exhausted. kmem_cache_alloc_bulk() cannot indicate a preferred node so
> it's compatible with sheaves refill in preferring the local node.
> Its users also have gfp flags that allow spinning, so document that
> as a requirement.
>
> Reviewed-by: Suren Baghdasaryan <surenb@google.com>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>

Reviewed-by: Suren Baghdasaryan <surenb@google.com>
