[PATCH v3 09/21] slab: add optimized sheaf refill from partial list

Vlastimil Babka posted 21 patches 3 weeks, 4 days ago
There is a newer version of this series
[PATCH v3 09/21] slab: add optimized sheaf refill from partial list
Posted by Vlastimil Babka 3 weeks, 4 days ago
At this point we have sheaves enabled for all caches, but their refill
is done via __kmem_cache_alloc_bulk() which relies on cpu (partial)
slabs - now a redundant caching layer that we are about to remove.

The refill will thus be done from slabs on the node partial list.
Introduce new functions that can do this in an optimized way, as that is
easier than modifying the __kmem_cache_alloc_bulk() call chain.

Extend struct partial_context so it can return a list of slabs taken from
the partial list, whose total number of free objects falls within the
requested min and max.

Introduce get_partial_node_bulk() that removes the slabs from the node's
partial list and returns them in that list.

Introduce get_freelist_nofreeze() which grabs the freelist without
freezing the slab.

Introduce alloc_from_new_slab() which can allocate multiple objects from
a newly allocated slab where we don't need to synchronize with freeing.
In some aspects it's similar to alloc_single_from_new_slab() but assumes
the cache is a non-debug one so it can avoid some actions.

Introduce __refill_objects() that uses the functions above to fill an
array of objects. It has to handle the possibility that the slabs will
contain more objects than were requested, due to concurrent freeing of
objects to those slabs. When no more slabs on partial lists are
available, it will allocate new slabs. It is intended to be used only in
contexts where spinning is allowed, so add a WARN_ON_ONCE check there.

Finally, switch refill_sheaf() to use __refill_objects(). Sheaves are
only refilled from contexts that allow spinning, or even blocking.
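
For now refill_sheaf() simply requests an exact count, i.e. min == max:

	filled = __refill_objects(s, &sheaf->objects[sheaf->size], gfp,
				  to_fill, to_fill);

A later change might pass a lower min so that the refill can stop early
when reaching max would mean taking another partial slab and returning
most of it back.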

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 mm/slub.c | 284 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 264 insertions(+), 20 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
index 9bea8a65e510..dce80463f92c 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -246,6 +246,9 @@ struct partial_context {
 	gfp_t flags;
 	unsigned int orig_size;
 	void *object;
+	unsigned int min_objects;
+	unsigned int max_objects;
+	struct list_head slabs;
 };
 
 static inline bool kmem_cache_debug(struct kmem_cache *s)
@@ -2650,9 +2653,9 @@ static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf)
 	stat(s, SHEAF_FREE);
 }
 
-static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
-				   size_t size, void **p);
-
+static unsigned int
+__refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
+		 unsigned int max);
 
 static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf,
 			 gfp_t gfp)
@@ -2663,8 +2666,8 @@ static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf,
 	if (!to_fill)
 		return 0;
 
-	filled = __kmem_cache_alloc_bulk(s, gfp, to_fill,
-					 &sheaf->objects[sheaf->size]);
+	filled = __refill_objects(s, &sheaf->objects[sheaf->size], gfp,
+			to_fill, to_fill);
 
 	sheaf->size += filled;
 
@@ -3522,6 +3525,63 @@ static inline void put_cpu_partial(struct kmem_cache *s, struct slab *slab,
 #endif
 static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags);
 
+static bool get_partial_node_bulk(struct kmem_cache *s,
+				  struct kmem_cache_node *n,
+				  struct partial_context *pc)
+{
+	struct slab *slab, *slab2;
+	unsigned int total_free = 0;
+	unsigned long flags;
+
+	/* Racy check to avoid taking the lock unnecessarily. */
+	if (!n || data_race(!n->nr_partial))
+		return false;
+
+	INIT_LIST_HEAD(&pc->slabs);
+
+	spin_lock_irqsave(&n->list_lock, flags);
+
+	list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
+		struct freelist_counters flc;
+		unsigned int slab_free;
+
+		if (!pfmemalloc_match(slab, pc->flags))
+			continue;
+
+		/*
+		 * determine the number of free objects in the slab racily
+		 *
+		 * due to atomic updates done by a racing free we should not
+		 * read an inconsistent value here, but do a sanity check anyway
+		 *
+		 * slab_free is a lower bound due to subsequent concurrent
+		 * freeing, the caller might get more objects than requested and
+		 * must deal with it
+		 */
+		flc.counters = data_race(READ_ONCE(slab->counters));
+		slab_free = flc.objects - flc.inuse;
+
+		if (unlikely(slab_free > oo_objects(s->oo)))
+			continue;
+
+		/* we have already min and this would get us over the max */
+		if (total_free >= pc->min_objects
+		    && total_free + slab_free > pc->max_objects)
+			break;
+
+		remove_partial(n, slab);
+
+		list_add(&slab->slab_list, &pc->slabs);
+
+		total_free += slab_free;
+		if (total_free >= pc->max_objects)
+			break;
+	}
+
+	spin_unlock_irqrestore(&n->list_lock, flags);
+	return total_free > 0;
+}
+
 /*
  * Try to allocate a partial slab from a specific node.
  */
@@ -4448,6 +4508,33 @@ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
 	return old.freelist;
 }
 
+/*
+ * Get the slab's freelist and do not freeze it.
+ *
+ * Assumes the slab is isolated from node partial list and not frozen.
+ *
+ * Assumes this is performed only for caches without debugging so we
+ * don't need to worry about adding the slab to the full list
+ */
+static inline void *get_freelist_nofreeze(struct kmem_cache *s, struct slab *slab)
+{
+	struct freelist_counters old, new;
+
+	do {
+		old.freelist = slab->freelist;
+		old.counters = slab->counters;
+
+		new.freelist = NULL;
+		new.counters = old.counters;
+		VM_WARN_ON_ONCE(new.frozen);
+
+		new.inuse = old.objects;
+
+	} while (!slab_update_freelist(s, slab, &old, &new, "get_freelist_nofreeze"));
+
+	return old.freelist;
+}
+
 /*
  * Freeze the partial slab and return the pointer to the freelist.
  */
@@ -4471,6 +4558,65 @@ static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab)
 	return old.freelist;
 }
 
+/*
+ * If the object has been wiped upon free, make sure it's fully initialized by
+ * zeroing out freelist pointer.
+ *
+ * Note that we also wipe custom freelist pointers.
+ */
+static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
+						   void *obj)
+{
+	if (unlikely(slab_want_init_on_free(s)) && obj &&
+	    !freeptr_outside_object(s))
+		memset((void *)((char *)kasan_reset_tag(obj) + s->offset),
+			0, sizeof(void *));
+}
+
+static unsigned int alloc_from_new_slab(struct kmem_cache *s, struct slab *slab,
+		void **p, unsigned int count, bool allow_spin)
+{
+	unsigned int allocated = 0;
+	struct kmem_cache_node *n;
+	unsigned long flags;
+	void *object;
+
+	if (!allow_spin && (slab->objects - slab->inuse) > count) {
+
+		n = get_node(s, slab_nid(slab));
+
+		if (!spin_trylock_irqsave(&n->list_lock, flags)) {
+			/* Unlucky, discard newly allocated slab */
+			defer_deactivate_slab(slab, NULL);
+			return 0;
+		}
+	}
+
+	object = slab->freelist;
+	while (object && allocated < count) {
+		p[allocated] = object;
+		object = get_freepointer(s, object);
+		maybe_wipe_obj_freeptr(s, p[allocated]);
+
+		slab->inuse++;
+		allocated++;
+	}
+	slab->freelist = object;
+
+	if (slab->freelist) {
+
+		if (allow_spin) {
+			n = get_node(s, slab_nid(slab));
+			spin_lock_irqsave(&n->list_lock, flags);
+		}
+		add_partial(n, slab, DEACTIVATE_TO_HEAD);
+		spin_unlock_irqrestore(&n->list_lock, flags);
+	}
+
+	inc_slabs_node(s, slab_nid(slab), slab->objects);
+	return allocated;
+}
+
 /*
  * Slow path. The lockless freelist is empty or we need to perform
  * debugging duties.
@@ -4913,21 +5059,6 @@ static __always_inline void *__slab_alloc_node(struct kmem_cache *s,
 	return object;
 }
 
-/*
- * If the object has been wiped upon free, make sure it's fully initialized by
- * zeroing out freelist pointer.
- *
- * Note that we also wipe custom freelist pointers.
- */
-static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
-						   void *obj)
-{
-	if (unlikely(slab_want_init_on_free(s)) && obj &&
-	    !freeptr_outside_object(s))
-		memset((void *)((char *)kasan_reset_tag(obj) + s->offset),
-			0, sizeof(void *));
-}
-
 static __fastpath_inline
 struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
 {
@@ -5388,6 +5519,9 @@ static int __prefill_sheaf_pfmemalloc(struct kmem_cache *s,
 	return ret;
 }
 
+static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
+				   size_t size, void **p);
+
 /*
  * returns a sheaf that has at least the requested size
  * when prefilling is needed, do so with given gfp flags
@@ -7463,6 +7597,116 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
 }
 EXPORT_SYMBOL(kmem_cache_free_bulk);
 
+static unsigned int
+__refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
+		 unsigned int max)
+{
+	struct slab *slab, *slab2;
+	struct partial_context pc;
+	unsigned int refilled = 0;
+	unsigned long flags;
+	void *object;
+	int node;
+
+	pc.flags = gfp;
+	pc.min_objects = min;
+	pc.max_objects = max;
+
+	node = numa_mem_id();
+
+	if (WARN_ON_ONCE(!gfpflags_allow_spinning(gfp)))
+		return 0;
+
+	/* TODO: consider also other nodes? */
+	if (!get_partial_node_bulk(s, get_node(s, node), &pc))
+		goto new_slab;
+
+	list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
+
+		list_del(&slab->slab_list);
+
+		object = get_freelist_nofreeze(s, slab);
+
+		while (object && refilled < max) {
+			p[refilled] = object;
+			object = get_freepointer(s, object);
+			maybe_wipe_obj_freeptr(s, p[refilled]);
+
+			refilled++;
+		}
+
+		/*
+		 * Freelist had more objects than we can accommodate, we need to
+		 * free them back. We can treat it like a detached freelist, just
+		 * need to find the tail object.
+		 */
+		if (unlikely(object)) {
+			void *head = object;
+			void *tail;
+			int cnt = 0;
+
+			do {
+				tail = object;
+				cnt++;
+				object = get_freepointer(s, object);
+			} while (object);
+			do_slab_free(s, slab, head, tail, cnt, _RET_IP_);
+		}
+
+		if (refilled >= max)
+			break;
+	}
+
+	if (unlikely(!list_empty(&pc.slabs))) {
+		struct kmem_cache_node *n = get_node(s, node);
+
+		spin_lock_irqsave(&n->list_lock, flags);
+
+		list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
+
+			if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial))
+				continue;
+
+			list_del(&slab->slab_list);
+			add_partial(n, slab, DEACTIVATE_TO_HEAD);
+		}
+
+		spin_unlock_irqrestore(&n->list_lock, flags);
+
+		/* any slabs left are completely free and for discard */
+		list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
+
+			list_del(&slab->slab_list);
+			discard_slab(s, slab);
+		}
+	}
+
+
+	if (likely(refilled >= min))
+		goto out;
+
+new_slab:
+
+	slab = new_slab(s, pc.flags, node);
+	if (!slab)
+		goto out;
+
+	stat(s, ALLOC_SLAB);
+
+	/*
+	 * TODO: possible optimization - if we know we will consume the whole
+	 * slab we might skip creating the freelist?
+	 */
+	refilled += alloc_from_new_slab(s, slab, p + refilled, max - refilled,
+					/* allow_spin = */ true);
+
+	if (refilled < min)
+		goto new_slab;
+out:
+
+	return refilled;
+}
+
 static inline
 int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 			    void **p)

-- 
2.52.0
Re: [PATCH v3 09/21] slab: add optimized sheaf refill from partial list
Posted by Suren Baghdasaryan 2 weeks, 6 days ago
On Fri, Jan 16, 2026 at 2:40 PM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> At this point we have sheaves enabled for all caches, but their refill
> is done via __kmem_cache_alloc_bulk() which relies on cpu (partial)
> slabs - now a redundant caching layer that we are about to remove.
>
> The refill will thus be done from slabs on the node partial list.
> Introduce new functions that can do that in an optimized way as it's
> easier than modifying the __kmem_cache_alloc_bulk() call chain.
>
> Extend struct partial_context so it can return a list of slabs from the
> partial list with the sum of free objects in them within the requested
> min and max.
>
> Introduce get_partial_node_bulk() that removes the slabs from freelist
> and returns them in the list.
>
> Introduce get_freelist_nofreeze() which grabs the freelist without
> freezing the slab.
>
> Introduce alloc_from_new_slab() which can allocate multiple objects from
> a newly allocated slab where we don't need to synchronize with freeing.
> In some aspects it's similar to alloc_single_from_new_slab() but assumes
> the cache is a non-debug one so it can avoid some actions.
>
> Introduce __refill_objects() that uses the functions above to fill an
> array of objects. It has to handle the possibility that the slabs will
> contain more objects that were requested, due to concurrent freeing of
> objects to those slabs. When no more slabs on partial lists are
> available, it will allocate new slabs. It is intended to be only used
> in context where spinning is allowed, so add a WARN_ON_ONCE check there.
>
> Finally, switch refill_sheaf() to use __refill_objects(). Sheaves are
> only refilled from contexts that allow spinning, or even blocking.
>

Some nits, but otherwise LGTM.
Reviewed-by: Suren Baghdasaryan <surenb@google.com>

> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> ---
>  mm/slub.c | 284 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 264 insertions(+), 20 deletions(-)
>
> diff --git a/mm/slub.c b/mm/slub.c
> index 9bea8a65e510..dce80463f92c 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -246,6 +246,9 @@ struct partial_context {
>         gfp_t flags;
>         unsigned int orig_size;
>         void *object;
> +       unsigned int min_objects;
> +       unsigned int max_objects;
> +       struct list_head slabs;
>  };
>
>  static inline bool kmem_cache_debug(struct kmem_cache *s)
> @@ -2650,9 +2653,9 @@ static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf)
>         stat(s, SHEAF_FREE);
>  }
>
> -static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
> -                                  size_t size, void **p);
> -
> +static unsigned int
> +__refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
> +                unsigned int max);
>
>  static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf,
>                          gfp_t gfp)
> @@ -2663,8 +2666,8 @@ static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf,
>         if (!to_fill)
>                 return 0;
>
> -       filled = __kmem_cache_alloc_bulk(s, gfp, to_fill,
> -                                        &sheaf->objects[sheaf->size]);
> +       filled = __refill_objects(s, &sheaf->objects[sheaf->size], gfp,
> +                       to_fill, to_fill);
>
>         sheaf->size += filled;
>
> @@ -3522,6 +3525,63 @@ static inline void put_cpu_partial(struct kmem_cache *s, struct slab *slab,
>  #endif
>  static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags);
>
> +static bool get_partial_node_bulk(struct kmem_cache *s,
> +                                 struct kmem_cache_node *n,
> +                                 struct partial_context *pc)
> +{
> +       struct slab *slab, *slab2;
> +       unsigned int total_free = 0;
> +       unsigned long flags;
> +
> +       /* Racy check to avoid taking the lock unnecessarily. */
> +       if (!n || data_race(!n->nr_partial))
> +               return false;
> +
> +       INIT_LIST_HEAD(&pc->slabs);
> +
> +       spin_lock_irqsave(&n->list_lock, flags);
> +
> +       list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
> +               struct freelist_counters flc;
> +               unsigned int slab_free;
> +
> +               if (!pfmemalloc_match(slab, pc->flags))
> +                       continue;
> +
> +               /*
> +                * determine the number of free objects in the slab racily
> +                *
> +                * due to atomic updates done by a racing free we should not
> +                * read an inconsistent value here, but do a sanity check anyway
> +                *
> +                * slab_free is a lower bound due to subsequent concurrent
> +                * freeing, the caller might get more objects than requested and
> +                * must deal with it
> +                */
> +               flc.counters = data_race(READ_ONCE(slab->counters));
> +               slab_free = flc.objects - flc.inuse;
> +
> +               if (unlikely(slab_free > oo_objects(s->oo)))
> +                       continue;
> +
> +               /* we have already min and this would get us over the max */
> +               if (total_free >= pc->min_objects
> +                   && total_free + slab_free > pc->max_objects)
> +                       break;
> +
> +               remove_partial(n, slab);
> +
> +               list_add(&slab->slab_list, &pc->slabs);
> +
> +               total_free += slab_free;
> +               if (total_free >= pc->max_objects)
> +                       break;

From the above code it seems like you are trying to get at least
pc->min_objects and as close as possible to pc->max_objects without
exceeding it (with a possibility that we will exceed both min_objects
and max_objects in one step). Is that indeed the intent? Because
otherwise you could simplify these conditions to stop once you cross
pc->min_objects.
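
Roughly, the simpler termination I have in mind (just a sketch):

		remove_partial(n, slab);
		list_add(&slab->slab_list, &pc->slabs);

		total_free += slab_free;
		if (total_free >= pc->min_objects)
			break;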

> +       }
> +
> +       spin_unlock_irqrestore(&n->list_lock, flags);
> +       return total_free > 0;
> +}
> +
>  /*
>   * Try to allocate a partial slab from a specific node.
>   */
> @@ -4448,6 +4508,33 @@ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
>         return old.freelist;
>  }
>
> +/*
> + * Get the slab's freelist and do not freeze it.
> + *
> + * Assumes the slab is isolated from node partial list and not frozen.
> + *
> + * Assumes this is performed only for caches without debugging so we
> + * don't need to worry about adding the slab to the full list

nit: Missing a period at the end of the above sentence.

> + */
> +static inline void *get_freelist_nofreeze(struct kmem_cache *s, struct slab *slab)

I was going to comment on the similarities between
get_freelist_nofreeze(), get_freelist() and freeze_slab() and the
possibility of consolidating them, but then I saw you removing the
other functions in the next patch. So I'm mentioning it here merely
for other reviewers not to trip on this.

> +{
> +       struct freelist_counters old, new;
> +
> +       do {
> +               old.freelist = slab->freelist;
> +               old.counters = slab->counters;
> +
> +               new.freelist = NULL;
> +               new.counters = old.counters;
> +               VM_WARN_ON_ONCE(new.frozen);
> +
> +               new.inuse = old.objects;
> +
> +       } while (!slab_update_freelist(s, slab, &old, &new, "get_freelist_nofreeze"));
> +
> +       return old.freelist;
> +}
> +
>  /*
>   * Freeze the partial slab and return the pointer to the freelist.
>   */
> @@ -4471,6 +4558,65 @@ static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab)
>         return old.freelist;
>  }
>
> +/*
> + * If the object has been wiped upon free, make sure it's fully initialized by
> + * zeroing out freelist pointer.
> + *
> + * Note that we also wipe custom freelist pointers.
> + */
> +static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
> +                                                  void *obj)
> +{
> +       if (unlikely(slab_want_init_on_free(s)) && obj &&
> +           !freeptr_outside_object(s))
> +               memset((void *)((char *)kasan_reset_tag(obj) + s->offset),
> +                       0, sizeof(void *));
> +}
> +
> +static unsigned int alloc_from_new_slab(struct kmem_cache *s, struct slab *slab,
> +               void **p, unsigned int count, bool allow_spin)
> +{
> +       unsigned int allocated = 0;
> +       struct kmem_cache_node *n;
> +       unsigned long flags;
> +       void *object;
> +
> +       if (!allow_spin && (slab->objects - slab->inuse) > count) {
> +
> +               n = get_node(s, slab_nid(slab));
> +
> +               if (!spin_trylock_irqsave(&n->list_lock, flags)) {
> +                       /* Unlucky, discard newly allocated slab */
> +                       defer_deactivate_slab(slab, NULL);
> +                       return 0;
> +               }
> +       }
> +
> +       object = slab->freelist;
> +       while (object && allocated < count) {
> +               p[allocated] = object;
> +               object = get_freepointer(s, object);
> +               maybe_wipe_obj_freeptr(s, p[allocated]);
> +
> +               slab->inuse++;
> +               allocated++;
> +       }
> +       slab->freelist = object;
> +
> +       if (slab->freelist) {

nit: It's a bit subtle that the checks for slab->freelist here and the
earlier one for ((slab->objects - slab->inuse) > count) are
effectively equivalent. That's because this is a new slab and objects
can't be freed into it concurrently. I would feel better if both
checks were explicitly the same, like having "bool extra_objs =
(slab->objects - slab->inuse) > count;" and use it for both checks.
But this is minor, so feel free to ignore.
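
Roughly what I mean (sketch only, the name is just an example):

	bool extra_objs = (slab->objects - slab->inuse) > count;

	if (!allow_spin && extra_objs) {
		/* trylock n->list_lock up front, as the code does now */
	}

	/* ... allocate up to count objects from slab->freelist ... */

	if (extra_objs) {
		/* put the remainder on the partial list, as now */
	}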

> +
> +               if (allow_spin) {
> +                       n = get_node(s, slab_nid(slab));
> +                       spin_lock_irqsave(&n->list_lock, flags);
> +               }
> +               add_partial(n, slab, DEACTIVATE_TO_HEAD);
> +               spin_unlock_irqrestore(&n->list_lock, flags);
> +       }
> +
> +       inc_slabs_node(s, slab_nid(slab), slab->objects);
> +       return allocated;
> +}
> +
>  /*
>   * Slow path. The lockless freelist is empty or we need to perform
>   * debugging duties.
> @@ -4913,21 +5059,6 @@ static __always_inline void *__slab_alloc_node(struct kmem_cache *s,
>         return object;
>  }
>
> -/*
> - * If the object has been wiped upon free, make sure it's fully initialized by
> - * zeroing out freelist pointer.
> - *
> - * Note that we also wipe custom freelist pointers.
> - */
> -static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
> -                                                  void *obj)
> -{
> -       if (unlikely(slab_want_init_on_free(s)) && obj &&
> -           !freeptr_outside_object(s))
> -               memset((void *)((char *)kasan_reset_tag(obj) + s->offset),
> -                       0, sizeof(void *));
> -}
> -
>  static __fastpath_inline
>  struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
>  {
> @@ -5388,6 +5519,9 @@ static int __prefill_sheaf_pfmemalloc(struct kmem_cache *s,
>         return ret;
>  }
>
> +static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
> +                                  size_t size, void **p);
> +
>  /*
>   * returns a sheaf that has at least the requested size
>   * when prefilling is needed, do so with given gfp flags
> @@ -7463,6 +7597,116 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
>  }
>  EXPORT_SYMBOL(kmem_cache_free_bulk);
>
> +static unsigned int
> +__refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
> +                unsigned int max)
> +{
> +       struct slab *slab, *slab2;
> +       struct partial_context pc;
> +       unsigned int refilled = 0;
> +       unsigned long flags;
> +       void *object;
> +       int node;
> +
> +       pc.flags = gfp;
> +       pc.min_objects = min;
> +       pc.max_objects = max;
> +
> +       node = numa_mem_id();
> +
> +       if (WARN_ON_ONCE(!gfpflags_allow_spinning(gfp)))
> +               return 0;
> +
> +       /* TODO: consider also other nodes? */
> +       if (!get_partial_node_bulk(s, get_node(s, node), &pc))
> +               goto new_slab;
> +
> +       list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
> +
> +               list_del(&slab->slab_list);
> +
> +               object = get_freelist_nofreeze(s, slab);
> +
> +               while (object && refilled < max) {
> +                       p[refilled] = object;
> +                       object = get_freepointer(s, object);
> +                       maybe_wipe_obj_freeptr(s, p[refilled]);
> +
> +                       refilled++;
> +               }
> +
> +               /*
> +                * Freelist had more objects than we can accommodate, we need to
> +                * free them back. We can treat it like a detached freelist, just
> +                * need to find the tail object.
> +                */
> +               if (unlikely(object)) {
> +                       void *head = object;
> +                       void *tail;
> +                       int cnt = 0;
> +
> +                       do {
> +                               tail = object;
> +                               cnt++;
> +                               object = get_freepointer(s, object);
> +                       } while (object);
> +                       do_slab_free(s, slab, head, tail, cnt, _RET_IP_);
> +               }
> +
> +               if (refilled >= max)
> +                       break;
> +       }
> +
> +       if (unlikely(!list_empty(&pc.slabs))) {
> +               struct kmem_cache_node *n = get_node(s, node);
> +
> +               spin_lock_irqsave(&n->list_lock, flags);
> +
> +               list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
> +
> +                       if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial))
> +                               continue;
> +
> +                       list_del(&slab->slab_list);
> +                       add_partial(n, slab, DEACTIVATE_TO_HEAD);
> +               }
> +
> +               spin_unlock_irqrestore(&n->list_lock, flags);
> +
> +               /* any slabs left are completely free and for discard */
> +               list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
> +
> +                       list_del(&slab->slab_list);
> +                       discard_slab(s, slab);
> +               }
> +       }
> +
> +
> +       if (likely(refilled >= min))
> +               goto out;
> +
> +new_slab:
> +
> +       slab = new_slab(s, pc.flags, node);
> +       if (!slab)
> +               goto out;
> +
> +       stat(s, ALLOC_SLAB);
> +
> +       /*
> +        * TODO: possible optimization - if we know we will consume the whole
> +        * slab we might skip creating the freelist?
> +        */
> +       refilled += alloc_from_new_slab(s, slab, p + refilled, max - refilled,
> +                                       /* allow_spin = */ true);
> +
> +       if (refilled < min)
> +               goto new_slab;

Ok, allow_spin=true saves us from a potential infinite loop here. LGTM.

> +out:
> +
> +       return refilled;
> +}
> +
>  static inline
>  int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
>                             void **p)
>
> --
> 2.52.0
>
Re: [PATCH v3 09/21] slab: add optimized sheaf refill from partial list
Posted by Vlastimil Babka 2 weeks, 6 days ago
On 1/20/26 18:19, Suren Baghdasaryan wrote:
> On Fri, Jan 16, 2026 at 2:40 PM Vlastimil Babka <vbabka@suse.cz> wrote:
>>
>> At this point we have sheaves enabled for all caches, but their refill
>> is done via __kmem_cache_alloc_bulk() which relies on cpu (partial)
>> slabs - now a redundant caching layer that we are about to remove.
>>
>> The refill will thus be done from slabs on the node partial list.
>> Introduce new functions that can do that in an optimized way as it's
>> easier than modifying the __kmem_cache_alloc_bulk() call chain.
>>
>> Extend struct partial_context so it can return a list of slabs from the
>> partial list with the sum of free objects in them within the requested
>> min and max.
>>
>> Introduce get_partial_node_bulk() that removes the slabs from freelist
>> and returns them in the list.
>>
>> Introduce get_freelist_nofreeze() which grabs the freelist without
>> freezing the slab.
>>
>> Introduce alloc_from_new_slab() which can allocate multiple objects from
>> a newly allocated slab where we don't need to synchronize with freeing.
>> In some aspects it's similar to alloc_single_from_new_slab() but assumes
>> the cache is a non-debug one so it can avoid some actions.
>>
>> Introduce __refill_objects() that uses the functions above to fill an
>> array of objects. It has to handle the possibility that the slabs will
>> contain more objects that were requested, due to concurrent freeing of
>> objects to those slabs. When no more slabs on partial lists are
>> available, it will allocate new slabs. It is intended to be only used
>> in context where spinning is allowed, so add a WARN_ON_ONCE check there.
>>
>> Finally, switch refill_sheaf() to use __refill_objects(). Sheaves are
>> only refilled from contexts that allow spinning, or even blocking.
>>
> 
> Some nits, but otherwise LGTM.
> Reviewed-by: Suren Baghdasaryan <surenb@google.com>

Thanks.

> 
> From the above code it seems like you are trying to get at least
> pc->min_objects and as close as possible to the pc->max_objects
> without exceeding it (with a possibility that we will exceed both
> min_objects and max_objects in one step). Is that indeed the intent?
> Because otherwise could could simplify these conditions to stop once
> you crossed pc->min_objects.

Yeah see my reply to Harry, it's for future tuning.
 
>> +       if (slab->freelist) {
> 
> nit: It's a bit subtle that the checks for slab->freelist here and the
> earlier one for ((slab->objects - slab->inuse) > count) are
> effectively equivalent. That's because this is a new slab and objects
> can't be freed into it concurrently. I would feel better if both
> checks were explicitly the same, like having "bool extra_objs =
> (slab->objects - slab->inuse) > count;" and use it for both checks.
> But this is minor, so feel free to ignore.

OK, doing this for your and Hao Li's comments:

diff --git a/mm/slub.c b/mm/slub.c
index d6fde1d60ae9..015bdef11eb6 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4505,7 +4505,7 @@ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
  * Assumes the slab is isolated from node partial list and not frozen.
  *
  * Assumes this is performed only for caches without debugging so we
- * don't need to worry about adding the slab to the full list
+ * don't need to worry about adding the slab to the full list.
  */
 static inline void *get_freelist_nofreeze(struct kmem_cache *s, struct slab *slab)
 {
@@ -4569,10 +4569,17 @@ static unsigned int alloc_from_new_slab(struct kmem_cache *s, struct slab *slab,
 {
        unsigned int allocated = 0;
        struct kmem_cache_node *n;
+       bool needs_add_partial;
        unsigned long flags;
        void *object;
 
-       if (!allow_spin && (slab->objects - slab->inuse) > count) {
+       /*
+        * Are we going to put the slab on the partial list?
+        * Note slab->inuse is 0 on a new slab.
+        */
+       needs_add_partial = (slab->objects > count);
+
+       if (!allow_spin && needs_add_partial) {
 
                n = get_node(s, slab_nid(slab));
 
@@ -4594,7 +4601,7 @@ static unsigned int alloc_from_new_slab(struct kmem_cache *s, struct slab *slab,
        }
        slab->freelist = object;
 
-       if (slab->freelist) {
+       if (needs_add_partial) {
 
                if (allow_spin) {
                        n = get_node(s, slab_nid(slab));

Re: [PATCH v3 09/21] slab: add optimized sheaf refill from partial list
Posted by Suren Baghdasaryan 2 weeks, 6 days ago
On Wed, Jan 21, 2026 at 1:22 PM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> On 1/20/26 18:19, Suren Baghdasaryan wrote:
> > On Fri, Jan 16, 2026 at 2:40 PM Vlastimil Babka <vbabka@suse.cz> wrote:
> >>
> >> At this point we have sheaves enabled for all caches, but their refill
> >> is done via __kmem_cache_alloc_bulk() which relies on cpu (partial)
> >> slabs - now a redundant caching layer that we are about to remove.
> >>
> >> The refill will thus be done from slabs on the node partial list.
> >> Introduce new functions that can do that in an optimized way as it's
> >> easier than modifying the __kmem_cache_alloc_bulk() call chain.
> >>
> >> Extend struct partial_context so it can return a list of slabs from the
> >> partial list with the sum of free objects in them within the requested
> >> min and max.
> >>
> >> Introduce get_partial_node_bulk() that removes the slabs from freelist
> >> and returns them in the list.
> >>
> >> Introduce get_freelist_nofreeze() which grabs the freelist without
> >> freezing the slab.
> >>
> >> Introduce alloc_from_new_slab() which can allocate multiple objects from
> >> a newly allocated slab where we don't need to synchronize with freeing.
> >> In some aspects it's similar to alloc_single_from_new_slab() but assumes
> >> the cache is a non-debug one so it can avoid some actions.
> >>
> >> Introduce __refill_objects() that uses the functions above to fill an
> >> array of objects. It has to handle the possibility that the slabs will
> >> contain more objects that were requested, due to concurrent freeing of
> >> objects to those slabs. When no more slabs on partial lists are
> >> available, it will allocate new slabs. It is intended to be only used
> >> in context where spinning is allowed, so add a WARN_ON_ONCE check there.
> >>
> >> Finally, switch refill_sheaf() to use __refill_objects(). Sheaves are
> >> only refilled from contexts that allow spinning, or even blocking.
> >>
> >
> > Some nits, but otherwise LGTM.
> > Reviewed-by: Suren Baghdasaryan <surenb@google.com>
>
> Thanks.
>
> >
> > From the above code it seems like you are trying to get at least
> > pc->min_objects and as close as possible to the pc->max_objects
> > without exceeding it (with a possibility that we will exceed both
> > min_objects and max_objects in one step). Is that indeed the intent?
> > Because otherwise could could simplify these conditions to stop once
> > you crossed pc->min_objects.
>
> Yeah see my reply to Harry, it's for future tuning.

Ok.

>
> >> +       if (slab->freelist) {
> >
> > nit: It's a bit subtle that the checks for slab->freelist here and the
> > earlier one for ((slab->objects - slab->inuse) > count) are
> > effectively equivalent. That's because this is a new slab and objects
> > can't be freed into it concurrently. I would feel better if both
> > checks were explicitly the same, like having "bool extra_objs =
> > (slab->objects - slab->inuse) > count;" and use it for both checks.
> > But this is minor, so feel free to ignore.
>
> OK, doing this for your and Hao Li's comment:

Sounds good. Thanks!

>
> diff --git a/mm/slub.c b/mm/slub.c
> index d6fde1d60ae9..015bdef11eb6 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -4505,7 +4505,7 @@ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
>   * Assumes the slab is isolated from node partial list and not frozen.
>   *
>   * Assumes this is performed only for caches without debugging so we
> - * don't need to worry about adding the slab to the full list
> + * don't need to worry about adding the slab to the full list.
>   */
>  static inline void *get_freelist_nofreeze(struct kmem_cache *s, struct slab *slab)
>  {
> @@ -4569,10 +4569,17 @@ static unsigned int alloc_from_new_slab(struct kmem_cache *s, struct slab *slab,
>  {
>         unsigned int allocated = 0;
>         struct kmem_cache_node *n;
> +       bool needs_add_partial;
>         unsigned long flags;
>         void *object;
>
> -       if (!allow_spin && (slab->objects - slab->inuse) > count) {
> +       /*
> +        * Are we going to put the slab on the partial list?
> +        * Note slab->inuse is 0 on a new slab.
> +        */
> +       needs_add_partial = (slab->objects > count);
> +
> +       if (!allow_spin && needs_add_partial) {
>
>                 n = get_node(s, slab_nid(slab));
>
> @@ -4594,7 +4601,7 @@ static unsigned int alloc_from_new_slab(struct kmem_cache *s, struct slab *slab,
>         }
>         slab->freelist = object;
>
> -       if (slab->freelist) {
> +       if (needs_add_partial) {
>
>                 if (allow_spin) {
>                         n = get_node(s, slab_nid(slab));
>
Re: [PATCH v3 09/21] slab: add optimized sheaf refill from partial list
Posted by Hao Li 3 weeks ago
On Fri, Jan 16, 2026 at 03:40:29PM +0100, Vlastimil Babka wrote:
> At this point we have sheaves enabled for all caches, but their refill
> is done via __kmem_cache_alloc_bulk() which relies on cpu (partial)
> slabs - now a redundant caching layer that we are about to remove.
> 
> The refill will thus be done from slabs on the node partial list.
> Introduce new functions that can do that in an optimized way as it's
> easier than modifying the __kmem_cache_alloc_bulk() call chain.
> 
> Extend struct partial_context so it can return a list of slabs from the
> partial list with the sum of free objects in them within the requested
> min and max.
> 
> Introduce get_partial_node_bulk() that removes the slabs from freelist
> and returns them in the list.
> 
> Introduce get_freelist_nofreeze() which grabs the freelist without
> freezing the slab.
> 
> Introduce alloc_from_new_slab() which can allocate multiple objects from
> a newly allocated slab where we don't need to synchronize with freeing.
> In some aspects it's similar to alloc_single_from_new_slab() but assumes
> the cache is a non-debug one so it can avoid some actions.
> 
> Introduce __refill_objects() that uses the functions above to fill an
> array of objects. It has to handle the possibility that the slabs will
> contain more objects that were requested, due to concurrent freeing of
> objects to those slabs. When no more slabs on partial lists are
> available, it will allocate new slabs. It is intended to be only used
> in context where spinning is allowed, so add a WARN_ON_ONCE check there.
> 
> Finally, switch refill_sheaf() to use __refill_objects(). Sheaves are
> only refilled from contexts that allow spinning, or even blocking.
> 
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> ---
>  mm/slub.c | 284 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 264 insertions(+), 20 deletions(-)
> 
> diff --git a/mm/slub.c b/mm/slub.c
> index 9bea8a65e510..dce80463f92c 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -246,6 +246,9 @@ struct partial_context {
>  	gfp_t flags;
>  	unsigned int orig_size;
>  	void *object;
> +	unsigned int min_objects;
> +	unsigned int max_objects;
> +	struct list_head slabs;
>  };
>  
...
> +static unsigned int alloc_from_new_slab(struct kmem_cache *s, struct slab *slab,
> +		void **p, unsigned int count, bool allow_spin)
> +{
> +	unsigned int allocated = 0;
> +	struct kmem_cache_node *n;
> +	unsigned long flags;
> +	void *object;
> +
> +	if (!allow_spin && (slab->objects - slab->inuse) > count) {

I was wondering - given that slab->inuse is 0 for a newly allocated slab, is
there a reason to use "slab->objects - slab->inuse" instead of simply
slab->objects?

> +
> +		n = get_node(s, slab_nid(slab));
> +
> +		if (!spin_trylock_irqsave(&n->list_lock, flags)) {
> +			/* Unlucky, discard newly allocated slab */
> +			defer_deactivate_slab(slab, NULL);
> +			return 0;
> +		}
> +	}
> +
> +	object = slab->freelist;
> +	while (object && allocated < count) {
> +		p[allocated] = object;
> +		object = get_freepointer(s, object);
> +		maybe_wipe_obj_freeptr(s, p[allocated]);
> +
> +		slab->inuse++;
> +		allocated++;
> +	}
> +	slab->freelist = object;
> +
> +	if (slab->freelist) {
> +
> +		if (allow_spin) {
> +			n = get_node(s, slab_nid(slab));
> +			spin_lock_irqsave(&n->list_lock, flags);
> +		}
> +		add_partial(n, slab, DEACTIVATE_TO_HEAD);
> +		spin_unlock_irqrestore(&n->list_lock, flags);
> +	}
> +
> +	inc_slabs_node(s, slab_nid(slab), slab->objects);
> +	return allocated;
> +}
> +
...
Re: [PATCH v3 09/21] slab: add optimized sheaf refill from partial list
Posted by Harry Yoo 3 weeks ago
On Fri, Jan 16, 2026 at 03:40:29PM +0100, Vlastimil Babka wrote:
> At this point we have sheaves enabled for all caches, but their refill
> is done via __kmem_cache_alloc_bulk() which relies on cpu (partial)
> slabs - now a redundant caching layer that we are about to remove.
> 
> The refill will thus be done from slabs on the node partial list.
> Introduce new functions that can do that in an optimized way as it's
> easier than modifying the __kmem_cache_alloc_bulk() call chain.
> 
> Extend struct partial_context so it can return a list of slabs from the
> partial list with the sum of free objects in them within the requested
> min and max.
> 
> Introduce get_partial_node_bulk() that removes the slabs from freelist
> and returns them in the list.
> 
> Introduce get_freelist_nofreeze() which grabs the freelist without
> freezing the slab.
> 
> Introduce alloc_from_new_slab() which can allocate multiple objects from
> a newly allocated slab where we don't need to synchronize with freeing.
> In some aspects it's similar to alloc_single_from_new_slab() but assumes
> the cache is a non-debug one so it can avoid some actions.
> 
> Introduce __refill_objects() that uses the functions above to fill an
> array of objects. It has to handle the possibility that the slabs will
> contain more objects that were requested, due to concurrent freeing of
> objects to those slabs. When no more slabs on partial lists are
> available, it will allocate new slabs. It is intended to be only used
> in context where spinning is allowed, so add a WARN_ON_ONCE check there.
> 
> Finally, switch refill_sheaf() to use __refill_objects(). Sheaves are
> only refilled from contexts that allow spinning, or even blocking.
> 
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> ---
>  mm/slub.c | 284 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 264 insertions(+), 20 deletions(-)
> 
> diff --git a/mm/slub.c b/mm/slub.c
> index 9bea8a65e510..dce80463f92c 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -246,6 +246,9 @@ struct partial_context {
>  	gfp_t flags;
>  	unsigned int orig_size;
>  	void *object;
> +	unsigned int min_objects;
> +	unsigned int max_objects;
> +	struct list_head slabs;
>  };
>  
>  static inline bool kmem_cache_debug(struct kmem_cache *s)
> @@ -2663,8 +2666,8 @@ static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf,
>  	if (!to_fill)
>  		return 0;
>  
> -	filled = __kmem_cache_alloc_bulk(s, gfp, to_fill,
> -					 &sheaf->objects[sheaf->size]);
> +	filled = __refill_objects(s, &sheaf->objects[sheaf->size], gfp,
> +			to_fill, to_fill);

nit: perhaps handling min and max separately is unnecessary
if it's always min == max? we could have simply one 'count' or 'size'?

Otherwise LGTM!

-- 
Cheers,
Harry / Hyeonggon
Re: [PATCH v3 09/21] slab: add optimized sheaf refill from partial list
Posted by Vlastimil Babka 3 weeks ago
On 1/20/26 03:32, Harry Yoo wrote:
> On Fri, Jan 16, 2026 at 03:40:29PM +0100, Vlastimil Babka wrote:
>> At this point we have sheaves enabled for all caches, but their refill
>> is done via __kmem_cache_alloc_bulk() which relies on cpu (partial)
>> slabs - now a redundant caching layer that we are about to remove.
>> 
>> The refill will thus be done from slabs on the node partial list.
>> Introduce new functions that can do that in an optimized way as it's
>> easier than modifying the __kmem_cache_alloc_bulk() call chain.
>> 
>> Extend struct partial_context so it can return a list of slabs from the
>> partial list with the sum of free objects in them within the requested
>> min and max.
>> 
>> Introduce get_partial_node_bulk() that removes the slabs from freelist
>> and returns them in the list.
>> 
>> Introduce get_freelist_nofreeze() which grabs the freelist without
>> freezing the slab.
>> 
>> Introduce alloc_from_new_slab() which can allocate multiple objects from
>> a newly allocated slab where we don't need to synchronize with freeing.
>> In some aspects it's similar to alloc_single_from_new_slab() but assumes
>> the cache is a non-debug one so it can avoid some actions.
>> 
>> Introduce __refill_objects() that uses the functions above to fill an
>> array of objects. It has to handle the possibility that the slabs will
>> contain more objects that were requested, due to concurrent freeing of
>> objects to those slabs. When no more slabs on partial lists are
>> available, it will allocate new slabs. It is intended to be only used
>> in context where spinning is allowed, so add a WARN_ON_ONCE check there.
>> 
>> Finally, switch refill_sheaf() to use __refill_objects(). Sheaves are
>> only refilled from contexts that allow spinning, or even blocking.
>> 
>> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
>> ---
>>  mm/slub.c | 284 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
>>  1 file changed, 264 insertions(+), 20 deletions(-)
>> 
>> diff --git a/mm/slub.c b/mm/slub.c
>> index 9bea8a65e510..dce80463f92c 100644
>> --- a/mm/slub.c
>> +++ b/mm/slub.c
>> @@ -246,6 +246,9 @@ struct partial_context {
>>  	gfp_t flags;
>>  	unsigned int orig_size;
>>  	void *object;
>> +	unsigned int min_objects;
>> +	unsigned int max_objects;
>> +	struct list_head slabs;
>>  };
>>  
>>  static inline bool kmem_cache_debug(struct kmem_cache *s)
>> @@ -2663,8 +2666,8 @@ static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf,
>>  	if (!to_fill)
>>  		return 0;
>>  
>> -	filled = __kmem_cache_alloc_bulk(s, gfp, to_fill,
>> -					 &sheaf->objects[sheaf->size]);
>> +	filled = __refill_objects(s, &sheaf->objects[sheaf->size], gfp,
>> +			to_fill, to_fill);
> 
> nit: perhaps handling min and max separately is unnecessary
> if it's always min == max? we could have simply one 'count' or 'size'?

Right, so the plan was to set min to some fraction of max when refilling
sheaves, with the goal of maximizing the chance that once we grab a slab
from the partial list, we almost certainly fully use it and don't have to
return it back. But I didn't get there yet. It seems worthwhile to try
though, so we can leave the implementation prepared for it?
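
E.g. refill_sheaf() could then pass something like (the fraction is just
an illustration, not tuned):

	filled = __refill_objects(s, &sheaf->objects[sheaf->size], gfp,
			to_fill / 2, to_fill);

and be satisfied once at least the min is reached.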

> Otherwise LGTM!
>
Re: [PATCH v3 09/21] slab: add optimized sheaf refill from partial list
Posted by Harry Yoo 3 weeks ago
On Tue, Jan 20, 2026 at 07:33:47AM +0100, Vlastimil Babka wrote:
> On 1/20/26 03:32, Harry Yoo wrote:
> > On Fri, Jan 16, 2026 at 03:40:29PM +0100, Vlastimil Babka wrote:
> >> At this point we have sheaves enabled for all caches, but their refill
> >> is done via __kmem_cache_alloc_bulk() which relies on cpu (partial)
> >> slabs - now a redundant caching layer that we are about to remove.
> >> 
> >> The refill will thus be done from slabs on the node partial list.
> >> Introduce new functions that can do that in an optimized way as it's
> >> easier than modifying the __kmem_cache_alloc_bulk() call chain.
> >> 
> >> Extend struct partial_context so it can return a list of slabs from the
> >> partial list with the sum of free objects in them within the requested
> >> min and max.
> >> 
> >> Introduce get_partial_node_bulk() that removes the slabs from freelist
> >> and returns them in the list.
> >> 
> >> Introduce get_freelist_nofreeze() which grabs the freelist without
> >> freezing the slab.
> >> 
> >> Introduce alloc_from_new_slab() which can allocate multiple objects from
> >> a newly allocated slab where we don't need to synchronize with freeing.
> >> In some aspects it's similar to alloc_single_from_new_slab() but assumes
> >> the cache is a non-debug one so it can avoid some actions.
> >> 
> >> Introduce __refill_objects() that uses the functions above to fill an
> >> array of objects. It has to handle the possibility that the slabs will
> >> contain more objects that were requested, due to concurrent freeing of
> >> objects to those slabs. When no more slabs on partial lists are
> >> available, it will allocate new slabs. It is intended to be only used
> >> in context where spinning is allowed, so add a WARN_ON_ONCE check there.
> >> 
> >> Finally, switch refill_sheaf() to use __refill_objects(). Sheaves are
> >> only refilled from contexts that allow spinning, or even blocking.
> >> 
> >> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> >> ---
> >>  mm/slub.c | 284 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
> >>  1 file changed, 264 insertions(+), 20 deletions(-)
> >> 
> >> diff --git a/mm/slub.c b/mm/slub.c
> >> index 9bea8a65e510..dce80463f92c 100644
> >> --- a/mm/slub.c
> >> +++ b/mm/slub.c
> >> @@ -246,6 +246,9 @@ struct partial_context {
> >>  	gfp_t flags;
> >>  	unsigned int orig_size;
> >>  	void *object;
> >> +	unsigned int min_objects;
> >> +	unsigned int max_objects;
> >> +	struct list_head slabs;
> >>  };
> >>  
> >>  static inline bool kmem_cache_debug(struct kmem_cache *s)
> >> @@ -2663,8 +2666,8 @@ static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf,
> >>  	if (!to_fill)
> >>  		return 0;
> >>  
> >> -	filled = __kmem_cache_alloc_bulk(s, gfp, to_fill,
> >> -					 &sheaf->objects[sheaf->size]);
> >> +	filled = __refill_objects(s, &sheaf->objects[sheaf->size], gfp,
> >> +			to_fill, to_fill);
> > 
> > nit: perhaps handling min and max separately is unnecessary
> > if it's always min == max? we could have simply one 'count' or 'size'?
> 
> Right, so the plan was to set min to some fraction of max when refilling
> sheaves, with the goal of maximizing the chance that once we grab a slab
> from the partial list, we almost certainly fully use it and don't have to
> return it back.

Oh, you had a plan!

I'm having trouble imagining what it would look like though.
If we fetch more objects than `to_fill`, where do they go?
Have a larger array and fill multiple sheaves with it?

> But I didn't get to there yet. It seems worthwile to try
> though so we can leave the implementation prepared for it?

Yeah that's fine.

-- 
Cheers,
Harry / Hyeonggon
Re: [PATCH v3 09/21] slab: add optimized sheaf refill from partial list
Posted by Vlastimil Babka 3 weeks ago
On 1/20/26 11:27, Harry Yoo wrote:
> On Tue, Jan 20, 2026 at 07:33:47AM +0100, Vlastimil Babka wrote:
>> 
>> Right, so the plan was to set min to some fraction of max when refilling
>> sheaves, with the goal of maximizing the chance that once we grab a slab
>> from the partial list, we almost certainly fully use it and don't have to
>> return it back.
> 
> Oh, you had a plan!
> 
> I'm having trouble imagining what it would look like though.
> If we fetch more objects than `to_fill`, where do they go?
> Have a larger array and fill multiple sheaves with it?

Ah that wouldn't happen. Rather we would consider the sheaf to be full even
if it was filled a bit below its capacity, if trying to reach the full
capacity would mean taking a slab from the partial list, not using all
objects from it and having to return it to the list.
Of course this would not apply for a prefilled sheaf request or
kmem_cache_alloc_bulk().

>> But I didn't get to there yet. It seems worthwile to try
>> though so we can leave the implementation prepared for it?
> 
> Yeah that's fine.
>
Re: [PATCH v3 09/21] slab: add optimized sheaf refill from partial list
Posted by Harry Yoo 3 weeks, 1 day ago
On Fri, Jan 16, 2026 at 03:40:29PM +0100, Vlastimil Babka wrote:
> At this point we have sheaves enabled for all caches, but their refill
> is done via __kmem_cache_alloc_bulk() which relies on cpu (partial)
> slabs - now a redundant caching layer that we are about to remove.
> 
> The refill will thus be done from slabs on the node partial list.
> Introduce new functions that can do that in an optimized way as it's
> easier than modifying the __kmem_cache_alloc_bulk() call chain.
> 
> Extend struct partial_context so it can return a list of slabs from the
> partial list with the sum of free objects in them within the requested
> min and max.
> 
> Introduce get_partial_node_bulk() that removes the slabs from freelist
> and returns them in the list.
> 
> Introduce get_freelist_nofreeze() which grabs the freelist without
> freezing the slab.
> 
> Introduce alloc_from_new_slab() which can allocate multiple objects from
> a newly allocated slab where we don't need to synchronize with freeing.
> In some aspects it's similar to alloc_single_from_new_slab() but assumes
> the cache is a non-debug one so it can avoid some actions.
> 
> Introduce __refill_objects() that uses the functions above to fill an
> array of objects. It has to handle the possibility that the slabs will
> contain more objects that were requested, due to concurrent freeing of
> objects to those slabs. When no more slabs on partial lists are
> available, it will allocate new slabs. It is intended to be only used
> in context where spinning is allowed, so add a WARN_ON_ONCE check there.
> 
> Finally, switch refill_sheaf() to use __refill_objects(). Sheaves are
> only refilled from contexts that allow spinning, or even blocking.
> 
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> ---
>  mm/slub.c | 284 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 264 insertions(+), 20 deletions(-)
> 
> diff --git a/mm/slub.c b/mm/slub.c
> index 9bea8a65e510..dce80463f92c 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -3522,6 +3525,63 @@ static inline void put_cpu_partial(struct kmem_cache *s, struct slab *slab,
>  #endif
>  static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags);
>  
> +static bool get_partial_node_bulk(struct kmem_cache *s,
> +				  struct kmem_cache_node *n,
> +				  struct partial_context *pc)
> +{
> +	struct slab *slab, *slab2;
> +	unsigned int total_free = 0;
> +	unsigned long flags;
> +
> +	/* Racy check to avoid taking the lock unnecessarily. */
> +	if (!n || data_race(!n->nr_partial))
> +		return false;
> +
> +	INIT_LIST_HEAD(&pc->slabs);
> +
> +	spin_lock_irqsave(&n->list_lock, flags);
> +
> +	list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
> +		struct freelist_counters flc;
> +		unsigned int slab_free;
> +
> +		if (!pfmemalloc_match(slab, pc->flags))
> +			continue;
> +		/*
> +		 * determine the number of free objects in the slab racily
> +		 *
> +		 * due to atomic updates done by a racing free we should not
> +		 * read an inconsistent value here, but do a sanity check anyway
> +		 *
> +		 * slab_free is a lower bound due to subsequent concurrent
> +		 * freeing, the caller might get more objects than requested and
> +		 * must deal with it
> +		 */
> +		flc.counters = data_race(READ_ONCE(slab->counters));
> +		slab_free = flc.objects - flc.inuse;
> +
> +		if (unlikely(slab_free > oo_objects(s->oo)))
> +			continue;

When is this condition supposed to be true?

I guess it's when __update_freelist_slow() doesn't update
slab->counters atomically?

> +
> +		/* we have already min and this would get us over the max */
> +		if (total_free >= pc->min_objects
> +		    && total_free + slab_free > pc->max_objects)
> +			break;
> +
> +		remove_partial(n, slab);
> +
> +		list_add(&slab->slab_list, &pc->slabs);
> +
> +		total_free += slab_free;
> +		if (total_free >= pc->max_objects)
> +			break;
> +	}
> +
> +	spin_unlock_irqrestore(&n->list_lock, flags);
> +	return total_free > 0;
> +}
> +
>  /*
>   * Try to allocate a partial slab from a specific node.
>   */
> +static unsigned int alloc_from_new_slab(struct kmem_cache *s, struct slab *slab,
> +		void **p, unsigned int count, bool allow_spin)
> +{
> +	unsigned int allocated = 0;
> +	struct kmem_cache_node *n;
> +	unsigned long flags;
> +	void *object;
> +
> +	if (!allow_spin && (slab->objects - slab->inuse) > count) {
> +
> +		n = get_node(s, slab_nid(slab));
> +
> +		if (!spin_trylock_irqsave(&n->list_lock, flags)) {
> +			/* Unlucky, discard newly allocated slab */
> +			defer_deactivate_slab(slab, NULL);
> +			return 0;
> +		}
> +	}
> +
> +	object = slab->freelist;
> +	while (object && allocated < count) {
> +		p[allocated] = object;
> +		object = get_freepointer(s, object);
> +		maybe_wipe_obj_freeptr(s, p[allocated]);
> +
> +		slab->inuse++;
> +		allocated++;
> +	}
> +	slab->freelist = object;
> +
> +	if (slab->freelist) {
> +
> +		if (allow_spin) {
> +			n = get_node(s, slab_nid(slab));
> +			spin_lock_irqsave(&n->list_lock, flags);
> +		}
> +		add_partial(n, slab, DEACTIVATE_TO_HEAD);
> +		spin_unlock_irqrestore(&n->list_lock, flags);
> +	}
> +
> +	inc_slabs_node(s, slab_nid(slab), slab->objects);

Maybe add a comment explaining why inc_slabs_node() doesn't need to be
called under n->list_lock?
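
Something along these lines perhaps (assuming the per-node counters stay
atomic):

	/*
	 * No list_lock needed: inc_slabs_node() only updates the node's
	 * atomic nr_slabs / total_objects counters.
	 */
	inc_slabs_node(s, slab_nid(slab), slab->objects);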

> +	return allocated;
> +}
> +
>  /*
>   * Slow path. The lockless freelist is empty or we need to perform
>   * debugging duties.
> @@ -5388,6 +5519,9 @@ static int __prefill_sheaf_pfmemalloc(struct kmem_cache *s,
>  	return ret;
>  }
>  
> +static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
> +				   size_t size, void **p);
> +
>  /*
>   * returns a sheaf that has at least the requested size
>   * when prefilling is needed, do so with given gfp flags
> @@ -7463,6 +7597,116 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
>  }
>  EXPORT_SYMBOL(kmem_cache_free_bulk);
>  
> +static unsigned int
> +__refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
> +		 unsigned int max)
> +{
> +	struct slab *slab, *slab2;
> +	struct partial_context pc;
> +	unsigned int refilled = 0;
> +	unsigned long flags;
> +	void *object;
> +	int node;
> +
> +	pc.flags = gfp;
> +	pc.min_objects = min;
> +	pc.max_objects = max;
> +
> +	node = numa_mem_id();
> +
> +	if (WARN_ON_ONCE(!gfpflags_allow_spinning(gfp)))
> +		return 0;
> +
> +	/* TODO: consider also other nodes? */
> +	if (!get_partial_node_bulk(s, get_node(s, node), &pc))
> +		goto new_slab;
> +
> +	list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
> +
> +		list_del(&slab->slab_list);

When a slab is removed from the list,

> +		object = get_freelist_nofreeze(s, slab);
> +
> +		while (object && refilled < max) {
> +			p[refilled] = object;
> +			object = get_freepointer(s, object);
> +			maybe_wipe_obj_freeptr(s, p[refilled]);
> +
> +			refilled++;
> +		}
> +
> +		/*
> +		 * Freelist had more objects than we can accommodate, we need to
> +		 * free them back. We can treat it like a detached freelist, just
> +		 * need to find the tail object.
> +		 */
> +		if (unlikely(object)) {

And the freelist had more objects than requested,

> +			void *head = object;
> +			void *tail;
> +			int cnt = 0;
> +
> +			do {
> +				tail = object;
> +				cnt++;
> +				object = get_freepointer(s, object);
> +			} while (object);
> +			do_slab_free(s, slab, head, tail, cnt, _RET_IP_);

objects are freed to the slab but the slab may or may not be added back to
n->partial?

> +		}
> +
> +		if (refilled >= max)
> +			break;
> +	}
> +
> +	if (unlikely(!list_empty(&pc.slabs))) {
> +		struct kmem_cache_node *n = get_node(s, node);
> +
> +		spin_lock_irqsave(&n->list_lock, flags);
> +
> +		list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
> +
> +			if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial))
> +				continue;
> +
> +			list_del(&slab->slab_list);
> +			add_partial(n, slab, DEACTIVATE_TO_HEAD);
> +		}
> +
> +		spin_unlock_irqrestore(&n->list_lock, flags);
> +
> +		/* any slabs left are completely free and for discard */
> +		list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
> +
> +			list_del(&slab->slab_list);
> +			discard_slab(s, slab);
> +		}
> +	}
> +
> +
> +	if (likely(refilled >= min))
> +		goto out;
> +
> +new_slab:
> +
> +	slab = new_slab(s, pc.flags, node);
> +	if (!slab)
> +		goto out;
> +
> +	stat(s, ALLOC_SLAB);
> +
> +	/*
> +	 * TODO: possible optimization - if we know we will consume the whole
> +	 * slab we might skip creating the freelist?
> +	 */
> +	refilled += alloc_from_new_slab(s, slab, p + refilled, max - refilled,
> +					/* allow_spin = */ true);
> +
> +	if (refilled < min)
> +		goto new_slab;

It should jump to out: label when alloc_from_new_slab() returns zero
(trylock failed).

...Oh wait, no. I was confused.

Why does alloc_from_new_slab() handle the !allow_spin case when it is never
called with allow_spin == false?

> +out:
> +
> +	return refilled;
> +}

-- 
Cheers,
Harry / Hyeonggon
Re: [PATCH v3 09/21] slab: add optimized sheaf refill from partial list
Posted by Vlastimil Babka 3 weeks, 1 day ago
On 1/19/26 07:41, Harry Yoo wrote:
> On Fri, Jan 16, 2026 at 03:40:29PM +0100, Vlastimil Babka wrote:
>> @@ -3522,6 +3525,63 @@ static inline void put_cpu_partial(struct kmem_cache *s, struct slab *slab,
>>  #endif
>>  static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags);
>>  
>> +static bool get_partial_node_bulk(struct kmem_cache *s,
>> +				  struct kmem_cache_node *n,
>> +				  struct partial_context *pc)
>> +{
>> +	struct slab *slab, *slab2;
>> +	unsigned int total_free = 0;
>> +	unsigned long flags;
>> +
>> +	/* Racy check to avoid taking the lock unnecessarily. */
>> +	if (!n || data_race(!n->nr_partial))
>> +		return false;
>> +
>> +	INIT_LIST_HEAD(&pc->slabs);
>> +
>> +	spin_lock_irqsave(&n->list_lock, flags);
>> +
>> +	list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
>> +		struct freelist_counters flc;
>> +		unsigned int slab_free;
>> +
>> +		if (!pfmemalloc_match(slab, pc->flags))
>> +			continue;
>> +		/*
>> +		 * determine the number of free objects in the slab racily
>> +		 *
>> +		 * due to atomic updates done by a racing free we should not
>> +		 * read an inconsistent value here, but do a sanity check anyway
>> +		 *
>> +		 * slab_free is a lower bound due to subsequent concurrent
>> +		 * freeing, the caller might get more objects than requested and
>> +		 * must deal with it
>> +		 */
>> +		flc.counters = data_race(READ_ONCE(slab->counters));
>> +		slab_free = flc.objects - flc.inuse;
>> +
>> +		if (unlikely(slab_free > oo_objects(s->oo)))
>> +			continue;
> 
> When is this condition supposed to be true?
> 
> I guess it's when __update_freelist_slow() doesn't update
> slab->counters atomically?

Yeah. It could probably be solved with WRITE_ONCE() there, as this is only
about hypothetical read/write tearing, not about seeing stale values. Or not?
I just wanted to be careful.
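
Something like this, perhaps (untested, and the exact lines in
__update_freelist_slow() may differ a bit, so take it as a sketch only):

	slab->freelist = freelist_new;
-	slab->counters = counters_new;
+	WRITE_ONCE(slab->counters, counters_new);

That would pair with the READ_ONCE() in get_partial_node_bulk() so the racy
read can't observe a torn value.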

>> +
>> +		/* we have already min and this would get us over the max */
>> +		if (total_free >= pc->min_objects
>> +		    && total_free + slab_free > pc->max_objects)
>> +			break;
>> +
>> +		remove_partial(n, slab);
>> +
>> +		list_add(&slab->slab_list, &pc->slabs);
>> +
>> +		total_free += slab_free;
>> +		if (total_free >= pc->max_objects)
>> +			break;
>> +	}
>> +
>> +	spin_unlock_irqrestore(&n->list_lock, flags);
>> +	return total_free > 0;
>> +}
>> +
>>  /*
>>   * Try to allocate a partial slab from a specific node.
>>   */
>> +static unsigned int alloc_from_new_slab(struct kmem_cache *s, struct slab *slab,
>> +		void **p, unsigned int count, bool allow_spin)
>> +{
>> +	unsigned int allocated = 0;
>> +	struct kmem_cache_node *n;
>> +	unsigned long flags;
>> +	void *object;
>> +
>> +	if (!allow_spin && (slab->objects - slab->inuse) > count) {
>> +
>> +		n = get_node(s, slab_nid(slab));
>> +
>> +		if (!spin_trylock_irqsave(&n->list_lock, flags)) {
>> +			/* Unlucky, discard newly allocated slab */
>> +			defer_deactivate_slab(slab, NULL);
>> +			return 0;
>> +		}
>> +	}
>> +
>> +	object = slab->freelist;
>> +	while (object && allocated < count) {
>> +		p[allocated] = object;
>> +		object = get_freepointer(s, object);
>> +		maybe_wipe_obj_freeptr(s, p[allocated]);
>> +
>> +		slab->inuse++;
>> +		allocated++;
>> +	}
>> +	slab->freelist = object;
>> +
>> +	if (slab->freelist) {
>> +
>> +		if (allow_spin) {
>> +			n = get_node(s, slab_nid(slab));
>> +			spin_lock_irqsave(&n->list_lock, flags);
>> +		}
>> +		add_partial(n, slab, DEACTIVATE_TO_HEAD);
>> +		spin_unlock_irqrestore(&n->list_lock, flags);
>> +	}
>> +
>> +	inc_slabs_node(s, slab_nid(slab), slab->objects);
> 
> Maybe add a comment explaining why inc_slabs_node() doesn't need to be
> called under n->list_lock?

Hm, we might not even be holding it. The old code also did the inc with no
comment. If anything could use one, it would be in
alloc_single_from_new_slab()? But that's outside the scope here.

>> +	return allocated;
>> +}
>> +
>>  /*
>>   * Slow path. The lockless freelist is empty or we need to perform
>>   * debugging duties.

>> +new_slab:
>> +
>> +	slab = new_slab(s, pc.flags, node);
>> +	if (!slab)
>> +		goto out;
>> +
>> +	stat(s, ALLOC_SLAB);
>> +
>> +	/*
>> +	 * TODO: possible optimization - if we know we will consume the whole
>> +	 * slab we might skip creating the freelist?
>> +	 */
>> +	refilled += alloc_from_new_slab(s, slab, p + refilled, max - refilled,
>> +					/* allow_spin = */ true);
>> +
>> +	if (refilled < min)
>> +		goto new_slab;
> 
> It should jump to out: label when alloc_from_new_slab() returns zero
> (trylock failed).
> 
> ...Oh wait, no. I was confused.
> 
> Why does alloc_from_new_slab() handle the !allow_spin case when it is never
> called with allow_spin == false?

The next patch will use it, so it seemed easier to add it already. I'll note
that in the commit log.

>> +out:
>> +
>> +	return refilled;
>> +}
>
Re: [PATCH v3 09/21] slab: add optimized sheaf refill from partial list
Posted by Harry Yoo 3 weeks ago
On Mon, Jan 19, 2026 at 11:54:18AM +0100, Vlastimil Babka wrote:
> On 1/19/26 07:41, Harry Yoo wrote:
> > On Fri, Jan 16, 2026 at 03:40:29PM +0100, Vlastimil Babka wrote:
> >> @@ -3522,6 +3525,63 @@ static inline void put_cpu_partial(struct kmem_cache *s, struct slab *slab,
> >>  #endif
> >>  static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags);
> >>  
> >> +static bool get_partial_node_bulk(struct kmem_cache *s,
> >> +				  struct kmem_cache_node *n,
> >> +				  struct partial_context *pc)
> >> +{
> >> +	struct slab *slab, *slab2;
> >> +	unsigned int total_free = 0;
> >> +	unsigned long flags;
> >> +
> >> +	/* Racy check to avoid taking the lock unnecessarily. */
> >> +	if (!n || data_race(!n->nr_partial))
> >> +		return false;
> >> +
> >> +	INIT_LIST_HEAD(&pc->slabs);
> >> +
> >> +	spin_lock_irqsave(&n->list_lock, flags);
> >> +
> >> +	list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
> >> +		struct freelist_counters flc;
> >> +		unsigned int slab_free;
> >> +
> >> +		if (!pfmemalloc_match(slab, pc->flags))
> >> +			continue;
> >> +		/*
> >> +		 * determine the number of free objects in the slab racily
> >> +		 *
> >> +		 * due to atomic updates done by a racing free we should not
> >> +		 * read an inconsistent value here, but do a sanity check anyway
> >> +		 *
> >> +		 * slab_free is a lower bound due to subsequent concurrent
> >> +		 * freeing, the caller might get more objects than requested and
> >> +		 * must deal with it
> >> +		 */
> >> +		flc.counters = data_race(READ_ONCE(slab->counters));
> >> +		slab_free = flc.objects - flc.inuse;
> >> +
> >> +		if (unlikely(slab_free > oo_objects(s->oo)))
> >> +			continue;
> > 
> > When is this condition supposed to be true?
> > 
> > I guess it's when __update_freelist_slow() doesn't update
> > slab->counters atomically?
> 
> Yeah. It could probably be solved with WRITE_ONCE() there, as this is only
> about hypothetical read/write tearing, not about seeing stale values.

Ok. That's less confusing than "we should not read an inconsistent value
here, but do a sanity check anyway".

> >> +
> >> +		/* we have already min and this would get us over the max */
> >> +		if (total_free >= pc->min_objects
> >> +		    && total_free + slab_free > pc->max_objects)
> >> +			break;
> >> +
> >> +		remove_partial(n, slab);
> >> +
> >> +		list_add(&slab->slab_list, &pc->slabs);
> >> +
> >> +		total_free += slab_free;
> >> +		if (total_free >= pc->max_objects)
> >> +			break;
> >> +	}
> >> +
> >> +	spin_unlock_irqrestore(&n->list_lock, flags);
> >> +	return total_free > 0;
> >> +}
> >> +
> >>  /*
> >>   * Try to allocate a partial slab from a specific node.
> >>   */
> >> +static unsigned int alloc_from_new_slab(struct kmem_cache *s, struct slab *slab,
> >> +		void **p, unsigned int count, bool allow_spin)
> >> +{
> >> +	unsigned int allocated = 0;
> >> +	struct kmem_cache_node *n;
> >> +	unsigned long flags;
> >> +	void *object;
> >> +
> >> +	if (!allow_spin && (slab->objects - slab->inuse) > count) {
> >> +
> >> +		n = get_node(s, slab_nid(slab));
> >> +
> >> +		if (!spin_trylock_irqsave(&n->list_lock, flags)) {
> >> +			/* Unlucky, discard newly allocated slab */
> >> +			defer_deactivate_slab(slab, NULL);
> >> +			return 0;
> >> +		}
> >> +	}
> >> +
> >> +	object = slab->freelist;
> >> +	while (object && allocated < count) {
> >> +		p[allocated] = object;
> >> +		object = get_freepointer(s, object);
> >> +		maybe_wipe_obj_freeptr(s, p[allocated]);
> >> +
> >> +		slab->inuse++;
> >> +		allocated++;
> >> +	}
> >> +	slab->freelist = object;
> >> +
> >> +	if (slab->freelist) {
> >> +
> >> +		if (allow_spin) {
> >> +			n = get_node(s, slab_nid(slab));
> >> +			spin_lock_irqsave(&n->list_lock, flags);
> >> +		}
> >> +		add_partial(n, slab, DEACTIVATE_TO_HEAD);
> >> +		spin_unlock_irqrestore(&n->list_lock, flags);
> >> +	}
> >> +
> >> +	inc_slabs_node(s, slab_nid(slab), slab->objects);
> > 
> > Maybe add a comment explaining why inc_slabs_node() doesn't need to be
> > called under n->list_lock?
> 
> Hm, we might not even be holding it. The old code also did the inc with no
> comment. If anything could use one, it would be in
> alloc_single_from_new_slab()? But that's outside the scope here.

Ok. Perhaps worth adding something like this later, but yeah it's outside
the scope here.

diff --git a/mm/slub.c b/mm/slub.c
index 698c0d940f06..c5a1e47dfe16 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1633,6 +1633,9 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
 {
 	struct kmem_cache_node *n = get_node(s, node);
 
+	if (kmem_cache_debug(s))
+		/* slab validation may generate false errors without the lock */
+		lockdep_assert_held(&n->list_lock);
 	atomic_long_inc(&n->nr_slabs);
 	atomic_long_add(objects, &n->total_objects);
 }


-- 
Cheers,
Harry / Hyeonggon
Re: [PATCH v3 09/21] slab: add optimized sheaf refill from partial list
Posted by Hao Li 3 weeks ago
On Tue, Jan 20, 2026 at 10:41:37AM +0900, Harry Yoo wrote:
> On Mon, Jan 19, 2026 at 11:54:18AM +0100, Vlastimil Babka wrote:
> > On 1/19/26 07:41, Harry Yoo wrote:
> > > On Fri, Jan 16, 2026 at 03:40:29PM +0100, Vlastimil Babka wrote:
> > >> +	inc_slabs_node(s, slab_nid(slab), slab->objects);
> > > 
> > > Maybe add a comment explaining why inc_slabs_node() doesn't need to be
> > > called under n->list_lock?

I think this is a great observation.

> > 
> > Hm, we might not even be holding it. The old code also did the inc with no
> > comment. If anything could use one, it would be in
> > alloc_single_from_new_slab()? But that's outside the scope here.
> 
> Ok. Perhaps worth adding something like this later, but yeah it's outside
> the scope here.
> 
> diff --git a/mm/slub.c b/mm/slub.c
> index 698c0d940f06..c5a1e47dfe16 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -1633,6 +1633,9 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
>  {
>  	struct kmem_cache_node *n = get_node(s, node);
>  
> +	if (kmem_cache_debug(s))
> +		/* slab validation may generate false errors without the lock */
> +		lockdep_assert_held(&n->list_lock);
>  	atomic_long_inc(&n->nr_slabs);
>  	atomic_long_add(objects, &n->total_objects);
>  }

Yes. This makes sense to me.

Just to double-check - I noticed that inc_slabs_node() is also called by
early_kmem_cache_node_alloc(). Could this potentially lead to false positive
warnings for boot-time caches when debug flags are enabled?

-- 
Thanks,
Hao
Re: [PATCH v3 09/21] slab: add optimized sheaf refill from partial list
Posted by Harry Yoo 3 weeks ago
On Tue, Jan 20, 2026 at 05:32:37PM +0800, Hao Li wrote:
> On Tue, Jan 20, 2026 at 10:41:37AM +0900, Harry Yoo wrote:
> > On Mon, Jan 19, 2026 at 11:54:18AM +0100, Vlastimil Babka wrote:
> > > On 1/19/26 07:41, Harry Yoo wrote:
> > > > On Fri, Jan 16, 2026 at 03:40:29PM +0100, Vlastimil Babka wrote:
> > > >> +	inc_slabs_node(s, slab_nid(slab), slab->objects);
> > > > 
> > > > Maybe add a comment explaining why inc_slabs_node() doesn't need to be
> > > > called under n->list_lock?
> 
> I think this is a great observation.
> 
> > > 
> > > Hm, we might not even be holding it. The old code also did the inc with no
> > > comment. If anything could use one, it would be in
> > > alloc_single_from_new_slab()? But that's outside the scope here.
> > 
> > Ok. Perhaps worth adding something like this later, but yeah it's outside
> > the scope here.
> > 
> > diff --git a/mm/slub.c b/mm/slub.c
> > index 698c0d940f06..c5a1e47dfe16 100644
> > --- a/mm/slub.c
> > +++ b/mm/slub.c
> > @@ -1633,6 +1633,9 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
> >  {
> >  	struct kmem_cache_node *n = get_node(s, node);
> >  
> > +	if (kmem_cache_debug(s))
> > +		/* slab validation may generate false errors without the lock */
> > +		lockdep_assert_held(&n->list_lock);
> >  	atomic_long_inc(&n->nr_slabs);
> >  	atomic_long_add(objects, &n->total_objects);
> >  }
> 
> Yes. This makes sense to me.
> 
> Just to double-check - I noticed that inc_slabs_node() is also called by
> early_kmem_cache_node_alloc(). Could this potentially lead to false positive
> warnings for boot-time caches when debug flags are enabled?

Good point. Perhaps the condition should be

if ((slab_state != DOWN) && kmem_cache_debug(s))
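
IOW the earlier (untested) sketch would become something like:

	if ((slab_state != DOWN) && kmem_cache_debug(s))
		/* slab validation may generate false errors without the lock */
		lockdep_assert_held(&n->list_lock);

so the assert is simply skipped for the boot-time calls from
early_kmem_cache_node_alloc().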

-- 
Cheers,
Harry / Hyeonggon
Re: [PATCH v3 09/21] slab: add optimized sheaf refill from partial list
Posted by Harry Yoo 3 weeks, 1 day ago
On Mon, Jan 19, 2026 at 03:41:40PM +0900, Harry Yoo wrote:
> On Fri, Jan 16, 2026 at 03:40:29PM +0100, Vlastimil Babka wrote:
> > @@ -7463,6 +7597,116 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
> >  }
> >  EXPORT_SYMBOL(kmem_cache_free_bulk);
> >  
> > +static unsigned int
> > +__refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
> > +		 unsigned int max)
> > +{
> > +	struct slab *slab, *slab2;
> > +	struct partial_context pc;
> > +	unsigned int refilled = 0;
> > +	unsigned long flags;
> > +	void *object;
> > +	int node;
> > +
> > +	pc.flags = gfp;
> > +	pc.min_objects = min;
> > +	pc.max_objects = max;
> > +
> > +	node = numa_mem_id();
> > +
> > +	if (WARN_ON_ONCE(!gfpflags_allow_spinning(gfp)))
> > +		return 0;
> > +
> > +	/* TODO: consider also other nodes? */
> > +	if (!get_partial_node_bulk(s, get_node(s, node), &pc))
> > +		goto new_slab;
> > +
> > +	list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
> > +
> > +		list_del(&slab->slab_list);
> 
> When a slab is removed from the list,
> 
> > +		object = get_freelist_nofreeze(s, slab);
> > +
> > +		while (object && refilled < max) {
> > +			p[refilled] = object;
> > +			object = get_freepointer(s, object);
> > +			maybe_wipe_obj_freeptr(s, p[refilled]);
> > +
> > +			refilled++;
> > +		}
> > +
> > +		/*
> > +		 * Freelist had more objects than we can accommodate, we need to
> > +		 * free them back. We can treat it like a detached freelist, just
> > +		 * need to find the tail object.
> > +		 */
> > +		if (unlikely(object)) {
> 
> And the freelist had more objects than requested,
> 
> > +			void *head = object;
> > +			void *tail;
> > +			int cnt = 0;
> > +
> > +			do {
> > +				tail = object;
> > +				cnt++;
> > +				object = get_freepointer(s, object);
> > +			} while (object);
> > +			do_slab_free(s, slab, head, tail, cnt, _RET_IP_);
> 
> objects are freed to the slab but the slab may or may not be added back to
> n->partial?

No, since the slab becomes a full slab after get_freelist_nofreeze(),
do_slab_free() should add it back to the n->partial list!

-- 
Cheers,
Harry / Hyeonggon