mm/slub.c | 155 +++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 136 insertions(+), 19 deletions(-)
From: Shengming Hu <hu.shengming@zte.com.cn>
refill_objects() already notes that a whole-slab bulk refill could avoid
building a freelist that would be drained immediately.
When the remaining bulk allocation is large enough to consume an entire
new slab, building the freelist is unnecessary overhead. Instead,
allocate the slab without initializing its freelist and hand all objects
directly to the caller.
Handle CONFIG_SLAB_FREELIST_RANDOM=y as well by walking objects in the
randomized allocation order and placing them directly into the caller's
array, without constructing a temporary freelist.
Also mark setup_object() inline. After this optimization, the compiler no
longer consistently inlines this helper in the hot path, which can hurt
performance. Explicitly marking it inline restores the expected code
generation.
This reduces per-object overhead in bulk allocation paths and improves
allocation throughput significantly. In slub_bulk_bench, the time per
object drops by about 54% to 74% with CONFIG_SLAB_FREELIST_RANDOM=n, and
by about 62% to 74% with CONFIG_SLAB_FREELIST_RANDOM=y.
Benchmark results (slub_bulk_bench):
Machine: qemu-system-x86 -m 1024M -smp 8 -enable-kvm -cpu host
Kernel: Linux 7.0.0-rc6-next-20260330
Config: x86_64_defconfig
Cpu: 0
Rounds: 20
Total: 256MB
- CONFIG_SLAB_FREELIST_RANDOM=n -
obj_size=16, batch=256:
before: 5.29 +- 0.73 ns/object
after: 2.42 +- 0.05 ns/object
delta: -54.4%
obj_size=32, batch=128:
before: 7.65 +- 1.89 ns/object
after: 3.04 +- 0.03 ns/object
delta: -60.2%
obj_size=64, batch=64:
before: 11.07 +- 0.08 ns/object
after: 4.11 +- 0.04 ns/object
delta: -62.9%
obj_size=128, batch=32:
before: 19.95 +- 0.30 ns/object
after: 5.72 +- 0.05 ns/object
delta: -71.3%
obj_size=256, batch=32:
before: 24.31 +- 0.25 ns/object
after: 6.33 +- 0.14 ns/object
delta: -74.0%
obj_size=512, batch=32:
before: 22.48 +- 0.14 ns/object
after: 6.43 +- 0.10 ns/object
delta: -71.4%
- CONFIG_SLAB_FREELIST_RANDOM=y -
obj_size=16, batch=256:
before: 9.32 +- 1.26 ns/object
after: 3.51 +- 0.02 ns/object
delta: -62.4%
obj_size=32, batch=128:
before: 11.68 +- 0.15 ns/object
after: 4.18 +- 0.22 ns/object
delta: -64.2%
obj_size=64, batch=64:
before: 16.69 +- 1.36 ns/object
after: 5.22 +- 0.06 ns/object
delta: -68.7%
obj_size=128, batch=32:
before: 23.41 +- 0.23 ns/object
after: 7.40 +- 0.07 ns/object
delta: -68.4%
obj_size=256, batch=32:
before: 29.80 +- 0.44 ns/object
after: 7.98 +- 0.09 ns/object
delta: -73.2%
obj_size=512, batch=32:
before: 30.38 +- 0.36 ns/object
after: 8.01 +- 0.06 ns/object
delta: -73.6%
Link: https://github.com/HSM6236/slub_bulk_test.git
Signed-off-by: Shengming Hu <hu.shengming@zte.com.cn>
---
Changes in v2:
- Handle CONFIG_SLAB_FREELIST_RANDOM=y and add benchmark results.
- Update the QEMU benchmark setup to use -enable-kvm -cpu host so benchmark results better reflect native CPU performance.
- Link to v1: https://lore.kernel.org/all/20260328125538341lvTGRpS62UNdRiAAz2gH3@zte.com.cn/
---
mm/slub.c | 155 +++++++++++++++++++++++++++++++++++++++++++++++-------
1 file changed, 136 insertions(+), 19 deletions(-)
diff --git a/mm/slub.c b/mm/slub.c
index fb2c5c57bc4e..52da4a716b1b 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2733,7 +2733,7 @@ bool slab_free_freelist_hook(struct kmem_cache *s, void **head, void **tail,
return *head != NULL;
}
-static void *setup_object(struct kmem_cache *s, void *object)
+static inline void *setup_object(struct kmem_cache *s, void *object)
{
setup_object_debug(s, object);
object = kasan_init_slab_obj(s, object);
@@ -3399,6 +3399,53 @@ static bool shuffle_freelist(struct kmem_cache *s, struct slab *slab,
return true;
}
+static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
+ void *obj);
+
+static inline bool alloc_whole_from_new_slab_random(struct kmem_cache *s,
+ struct slab *slab, void **p,
+ bool allow_spin,
+ unsigned int *allocatedp)
+{
+ unsigned long pos, page_limit, freelist_count;
+ unsigned int allocated = 0;
+ void *next, *start;
+
+ if (slab->objects < 2 || !s->random_seq)
+ return false;
+
+ freelist_count = oo_objects(s->oo);
+
+ if (allow_spin) {
+ pos = get_random_u32_below(freelist_count);
+ } else {
+ struct rnd_state *state;
+
+ /*
+ * An interrupt or NMI handler might interrupt and change
+ * the state in the middle, but that's safe.
+ */
+ state = &get_cpu_var(slab_rnd_state);
+ pos = prandom_u32_state(state) % freelist_count;
+ put_cpu_var(slab_rnd_state);
+ }
+
+ page_limit = slab->objects * s->size;
+ start = fixup_red_left(s, slab_address(slab));
+
+ while (allocated < slab->objects) {
+ next = next_freelist_entry(s, &pos, start, page_limit,
+ freelist_count);
+ next = setup_object(s, next);
+ p[allocated] = next;
+ maybe_wipe_obj_freeptr(s, next);
+ allocated++;
+ }
+
+ *allocatedp = allocated;
+ return true;
+}
+
#else
static inline int init_cache_random_seq(struct kmem_cache *s)
{
@@ -3410,6 +3457,14 @@ static inline bool shuffle_freelist(struct kmem_cache *s, struct slab *slab,
{
return false;
}
+
+static inline bool alloc_whole_from_new_slab_random(struct kmem_cache *s,
+ struct slab *slab, void **p,
+ bool allow_spin,
+ unsigned int *allocatedp)
+{
+ return false;
+}
#endif /* CONFIG_SLAB_FREELIST_RANDOM */
static __always_inline void account_slab(struct slab *slab, int order,
@@ -3438,7 +3493,8 @@ static __always_inline void unaccount_slab(struct slab *slab, int order,
-(PAGE_SIZE << order));
}
-static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
+static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node,
+ bool build_freelist, bool *allow_spinp)
{
bool allow_spin = gfpflags_allow_spinning(flags);
struct slab *slab;
@@ -3446,7 +3502,10 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
gfp_t alloc_gfp;
void *start, *p, *next;
int idx;
- bool shuffle;
+ bool shuffle = false;
+
+ if (allow_spinp)
+ *allow_spinp = allow_spin;
flags &= gfp_allowed_mask;
@@ -3483,6 +3542,7 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
slab->frozen = 0;
slab->slab_cache = s;
+ slab->freelist = NULL;
kasan_poison_slab(slab);
@@ -3497,9 +3557,10 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
alloc_slab_obj_exts_early(s, slab);
account_slab(slab, oo_order(oo), s, flags);
- shuffle = shuffle_freelist(s, slab, allow_spin);
+ if (build_freelist)
+ shuffle = shuffle_freelist(s, slab, allow_spin);
- if (!shuffle) {
+ if (build_freelist && !shuffle) {
start = fixup_red_left(s, start);
start = setup_object(s, start);
slab->freelist = start;
@@ -3515,7 +3576,8 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
return slab;
}
-static struct slab *new_slab(struct kmem_cache *s, gfp_t flags, int node)
+static struct slab *new_slab(struct kmem_cache *s, gfp_t flags, int node,
+ bool build_freelist, bool *allow_spinp)
{
if (unlikely(flags & GFP_SLAB_BUG_MASK))
flags = kmalloc_fix_flags(flags);
@@ -3523,7 +3585,8 @@ static struct slab *new_slab(struct kmem_cache *s, gfp_t flags, int node)
WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO));
return allocate_slab(s,
- flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
+ flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK),
+ node, build_freelist, allow_spinp);
}
static void __free_slab(struct kmem_cache *s, struct slab *slab, bool allow_spin)
@@ -4395,6 +4458,48 @@ static unsigned int alloc_from_new_slab(struct kmem_cache *s, struct slab *slab,
return allocated;
}
+static unsigned int alloc_whole_from_new_slab(struct kmem_cache *s,
+ struct slab *slab, void **p, bool allow_spin)
+{
+
+ unsigned int allocated = 0;
+ void *object, *start;
+
+ if (alloc_whole_from_new_slab_random(s, slab, p, allow_spin,
+ &allocated)) {
+ goto done;
+ }
+
+ start = fixup_red_left(s, slab_address(slab));
+ object = setup_object(s, start);
+
+ while (allocated < slab->objects - 1) {
+ p[allocated] = object;
+ maybe_wipe_obj_freeptr(s, object);
+
+ allocated++;
+ object += s->size;
+ object = setup_object(s, object);
+ }
+
+ p[allocated] = object;
+ maybe_wipe_obj_freeptr(s, object);
+ allocated++;
+
+done:
+ slab->freelist = NULL;
+ slab->inuse = slab->objects;
+ inc_slabs_node(s, slab_nid(slab), slab->objects);
+
+ return allocated;
+}
+
+static inline bool bulk_refill_consumes_whole_slab(struct kmem_cache *s,
+ unsigned int count)
+{
+ return count >= oo_objects(s->oo);
+}
+
/*
* Slow path. We failed to allocate via percpu sheaves or they are not available
* due to bootstrap or debugging enabled or SLUB_TINY.
@@ -4441,7 +4546,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
if (object)
goto success;
- slab = new_slab(s, pc.flags, node);
+ slab = new_slab(s, pc.flags, node, true, NULL);
if (unlikely(!slab)) {
if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE)
@@ -7244,18 +7349,30 @@ refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
new_slab:
- slab = new_slab(s, gfp, local_node);
- if (!slab)
- goto out;
-
- stat(s, ALLOC_SLAB);
-
/*
- * TODO: possible optimization - if we know we will consume the whole
- * slab we might skip creating the freelist?
+ * If the remaining bulk allocation is large enough to consume
+ * an entire slab, avoid building the freelist only to drain it
+ * immediately. Instead, allocate a slab without a freelist and
+ * hand out all objects directly.
*/
- refilled += alloc_from_new_slab(s, slab, p + refilled, max - refilled,
- /* allow_spin = */ true);
+ if (bulk_refill_consumes_whole_slab(s, max - refilled)) {
+ bool allow_spin;
+
+ slab = new_slab(s, gfp, local_node, false, &allow_spin);
+ if (!slab)
+ goto out;
+ stat(s, ALLOC_SLAB);
+ refilled += alloc_whole_from_new_slab(s, slab, p + refilled,
+ allow_spin);
+ } else {
+ slab = new_slab(s, gfp, local_node, true, NULL);
+ if (!slab)
+ goto out;
+ stat(s, ALLOC_SLAB);
+ refilled += alloc_from_new_slab(s, slab, p + refilled,
+ max - refilled,
+ /* allow_spin = */ true);
+ }
if (refilled < min)
goto new_slab;
@@ -7587,7 +7704,7 @@ static void early_kmem_cache_node_alloc(int node)
BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
- slab = new_slab(kmem_cache_node, GFP_NOWAIT, node);
+ slab = new_slab(kmem_cache_node, GFP_NOWAIT, node, true, NULL);
BUG_ON(!slab);
if (slab_nid(slab) != node) {
--
2.25.1
On Wed, Apr 01, 2026 at 12:57:25PM +0800, hu.shengming@zte.com.cn wrote:
> From: Shengming Hu <hu.shengming@zte.com.cn>
>
> refill_objects() already notes that a whole-slab bulk refill could avoid
> building a freelist that would be drained immediately.
>
> When the remaining bulk allocation is large enough to consume an entire
> new slab, building the freelist is unnecessary overhead. Instead,
> allocate the slab without initializing its freelist and hand all objects
> directly to the caller.
>
> Handle CONFIG_SLAB_FREELIST_RANDOM=y as well by walking objects in the
> randomized allocation order and placing them directly into the caller's
> array, without constructing a temporary freelist.
>
> Also mark setup_object() inline. After this optimization, the compiler no
> longer consistently inlines this helper in the hot path, which can hurt
> performance. Explicitly marking it inline restores the expected code
> generation.
>
> This reduces per-object overhead in bulk allocation paths and improves
> allocation throughput significantly. In slub_bulk_bench, the time per
> object drops by about 54% to 74% with CONFIG_SLAB_FREELIST_RANDOM=n, and
> by about 62% to 74% with CONFIG_SLAB_FREELIST_RANDOM=y.
Thanks for the patch.
Here are some quick review comments.
>
> Benchmark results (slub_bulk_bench):
>
> Machine: qemu-system-x86 -m 1024M -smp 8 -enable-kvm -cpu host
> Kernel: Linux 7.0.0-rc6-next-20260330
> Config: x86_64_defconfig
> Cpu: 0
> Rounds: 20
> Total: 256MB
>
> - CONFIG_SLAB_FREELIST_RANDOM=n -
>
> obj_size=16, batch=256:
> before: 5.29 +- 0.73 ns/object
> after: 2.42 +- 0.05 ns/object
> delta: -54.4%
>
> obj_size=32, batch=128:
> before: 7.65 +- 1.89 ns/object
> after: 3.04 +- 0.03 ns/object
> delta: -60.2%
>
> obj_size=64, batch=64:
> before: 11.07 +- 0.08 ns/object
> after: 4.11 +- 0.04 ns/object
> delta: -62.9%
>
> obj_size=128, batch=32:
> before: 19.95 +- 0.30 ns/object
> after: 5.72 +- 0.05 ns/object
> delta: -71.3%
>
> obj_size=256, batch=32:
> before: 24.31 +- 0.25 ns/object
> after: 6.33 +- 0.14 ns/object
> delta: -74.0%
>
> obj_size=512, batch=32:
> before: 22.48 +- 0.14 ns/object
> after: 6.43 +- 0.10 ns/object
> delta: -71.4%
>
> - CONFIG_SLAB_FREELIST_RANDOM=y -
>
> obj_size=16, batch=256:
> before: 9.32 +- 1.26 ns/object
> after: 3.51 +- 0.02 ns/object
> delta: -62.4%
>
> obj_size=32, batch=128:
> before: 11.68 +- 0.15 ns/object
> after: 4.18 +- 0.22 ns/object
> delta: -64.2%
>
> obj_size=64, batch=64:
> before: 16.69 +- 1.36 ns/object
> after: 5.22 +- 0.06 ns/object
> delta: -68.7%
>
> obj_size=128, batch=32:
> before: 23.41 +- 0.23 ns/object
> after: 7.40 +- 0.07 ns/object
> delta: -68.4%
>
> obj_size=256, batch=32:
> before: 29.80 +- 0.44 ns/object
> after: 7.98 +- 0.09 ns/object
> delta: -73.2%
>
> obj_size=512, batch=32:
> before: 30.38 +- 0.36 ns/object
> after: 8.01 +- 0.06 ns/object
> delta: -73.6%
>
> Link: https://github.com/HSM6236/slub_bulk_test.git
> Signed-off-by: Shengming Hu <hu.shengming@zte.com.cn>
> ---
> Changes in v2:
> - Handle CONFIG_SLAB_FREELIST_RANDOM=y and add benchmark results.
> - Update the QEMU benchmark setup to use -enable-kvm -cpu host so benchmark results better reflect native CPU performance.
> - Link to v1: https://lore.kernel.org/all/20260328125538341lvTGRpS62UNdRiAAz2gH3@zte.com.cn/
>
> ---
> mm/slub.c | 155 +++++++++++++++++++++++++++++++++++++++++++++++-------
> 1 file changed, 136 insertions(+), 19 deletions(-)
>
> diff --git a/mm/slub.c b/mm/slub.c
> index fb2c5c57bc4e..52da4a716b1b 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -2733,7 +2733,7 @@ bool slab_free_freelist_hook(struct kmem_cache *s, void **head, void **tail,
> return *head != NULL;
> }
>
> -static void *setup_object(struct kmem_cache *s, void *object)
> +static inline void *setup_object(struct kmem_cache *s, void *object)
> {
> setup_object_debug(s, object);
> object = kasan_init_slab_obj(s, object);
> @@ -3399,6 +3399,53 @@ static bool shuffle_freelist(struct kmem_cache *s, struct slab *slab,
>
> return true;
> }
> +static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
> + void *obj);
> +
> +static inline bool alloc_whole_from_new_slab_random(struct kmem_cache *s,
> + struct slab *slab, void **p,
> + bool allow_spin,
> + unsigned int *allocatedp)
> +{
> + unsigned long pos, page_limit, freelist_count;
> + unsigned int allocated = 0;
> + void *next, *start;
> +
> + if (slab->objects < 2 || !s->random_seq)
> + return false;
> +
> + freelist_count = oo_objects(s->oo);
> +
> + if (allow_spin) {
> + pos = get_random_u32_below(freelist_count);
> + } else {
> + struct rnd_state *state;
> +
> + /*
> + * An interrupt or NMI handler might interrupt and change
> + * the state in the middle, but that's safe.
> + */
> + state = &get_cpu_var(slab_rnd_state);
> + pos = prandom_u32_state(state) % freelist_count;
> + put_cpu_var(slab_rnd_state);
> + }
> +
> + page_limit = slab->objects * s->size;
> + start = fixup_red_left(s, slab_address(slab));
> +
> + while (allocated < slab->objects) {
> + next = next_freelist_entry(s, &pos, start, page_limit,
> + freelist_count);
> + next = setup_object(s, next);
> + p[allocated] = next;
> + maybe_wipe_obj_freeptr(s, next);
> + allocated++;
> + }
> +
> + *allocatedp = allocated;
It seems we do not need to return the allocated count through allocatedp,
since the count should always be slab->objects.
> + return true;
> +}
> +
> #else
> static inline int init_cache_random_seq(struct kmem_cache *s)
> {
> @@ -3410,6 +3457,14 @@ static inline bool shuffle_freelist(struct kmem_cache *s, struct slab *slab,
> {
> return false;
> }
> +
> +static inline bool alloc_whole_from_new_slab_random(struct kmem_cache *s,
> + struct slab *slab, void **p,
> + bool allow_spin,
> + unsigned int *allocatedp)
> +{
> + return false;
> +}
> #endif /* CONFIG_SLAB_FREELIST_RANDOM */
>
> static __always_inline void account_slab(struct slab *slab, int order,
> @@ -3438,7 +3493,8 @@ static __always_inline void unaccount_slab(struct slab *slab, int order,
> -(PAGE_SIZE << order));
> }
>
> -static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
> +static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node,
> + bool build_freelist, bool *allow_spinp)
> {
> bool allow_spin = gfpflags_allow_spinning(flags);
> struct slab *slab;
> @@ -3446,7 +3502,10 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
> gfp_t alloc_gfp;
> void *start, *p, *next;
> int idx;
> - bool shuffle;
> + bool shuffle = false;
> +
> + if (allow_spinp)
> + *allow_spinp = allow_spin;
It seems unnecessary for allocate_slab() to compute allow_spin and return it
via allow_spinp.
We could instead calculate it directly in refill_objects() based on gfp.
>
> flags &= gfp_allowed_mask;
>
> @@ -3483,6 +3542,7 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
> slab->frozen = 0;
>
> slab->slab_cache = s;
> + slab->freelist = NULL;
>
> kasan_poison_slab(slab);
>
> @@ -3497,9 +3557,10 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
> alloc_slab_obj_exts_early(s, slab);
> account_slab(slab, oo_order(oo), s, flags);
>
> - shuffle = shuffle_freelist(s, slab, allow_spin);
> + if (build_freelist)
> + shuffle = shuffle_freelist(s, slab, allow_spin);
>
> - if (!shuffle) {
> + if (build_freelist && !shuffle) {
> start = fixup_red_left(s, start);
> start = setup_object(s, start);
> slab->freelist = start;
> @@ -3515,7 +3576,8 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
> return slab;
> }
>
> -static struct slab *new_slab(struct kmem_cache *s, gfp_t flags, int node)
> +static struct slab *new_slab(struct kmem_cache *s, gfp_t flags, int node,
> + bool build_freelist, bool *allow_spinp)
> {
> if (unlikely(flags & GFP_SLAB_BUG_MASK))
> flags = kmalloc_fix_flags(flags);
> @@ -3523,7 +3585,8 @@ static struct slab *new_slab(struct kmem_cache *s, gfp_t flags, int node)
> WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO));
>
> return allocate_slab(s,
> - flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
> + flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK),
> + node, build_freelist, allow_spinp);
> }
>
> static void __free_slab(struct kmem_cache *s, struct slab *slab, bool allow_spin)
> @@ -4395,6 +4458,48 @@ static unsigned int alloc_from_new_slab(struct kmem_cache *s, struct slab *slab,
> return allocated;
> }
>
> +static unsigned int alloc_whole_from_new_slab(struct kmem_cache *s,
> + struct slab *slab, void **p, bool allow_spin)
> +{
> +
> + unsigned int allocated = 0;
> + void *object, *start;
> +
> + if (alloc_whole_from_new_slab_random(s, slab, p, allow_spin,
> + &allocated)) {
> + goto done;
> + }
> +
> + start = fixup_red_left(s, slab_address(slab));
> + object = setup_object(s, start);
> +
> + while (allocated < slab->objects - 1) {
> + p[allocated] = object;
> + maybe_wipe_obj_freeptr(s, object);
> +
> + allocated++;
> + object += s->size;
> + object = setup_object(s, object);
> + }
Also, I feel the current patch contains some duplicated code like this loop.
Would it make sense to split allocate_slab() into two functions?
For example, the first part could be called allocate_slab_meta_setup()
(just an example name), and the second part could be
allocate_slab_objects_setup(), with the core logic
being the loop over objects. Then allocate_slab_objects_setup() could support
two modes: one called BUILD_FREELIST, which builds the freelist, and another
called EMIT_OBJECTS, which skips building the freelist and directly places the
objects into the target array.
> +
> + p[allocated] = object;
> + maybe_wipe_obj_freeptr(s, object);
> + allocated++;
> +
> +done:
> + slab->freelist = NULL;
> + slab->inuse = slab->objects;
> + inc_slabs_node(s, slab_nid(slab), slab->objects);
> +
> + return allocated;
> +}
> +
> +static inline bool bulk_refill_consumes_whole_slab(struct kmem_cache *s,
> + unsigned int count)
> +{
> + return count >= oo_objects(s->oo);
It seems using s->oo here may be a bit too strict. In allocate_slab(), the
object count can fall back to s->min, so comparing against the actual
slab->objects might be more reasonable (if I understand correctly...).
> +}
> +
> /*
> * Slow path. We failed to allocate via percpu sheaves or they are not available
> * due to bootstrap or debugging enabled or SLUB_TINY.
> @@ -4441,7 +4546,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
> if (object)
> goto success;
>
> - slab = new_slab(s, pc.flags, node);
> + slab = new_slab(s, pc.flags, node, true, NULL);
>
> if (unlikely(!slab)) {
> if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE)
> @@ -7244,18 +7349,30 @@ refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
>
> new_slab:
>
> - slab = new_slab(s, gfp, local_node);
> - if (!slab)
> - goto out;
> -
> - stat(s, ALLOC_SLAB);
> -
> /*
> - * TODO: possible optimization - if we know we will consume the whole
> - * slab we might skip creating the freelist?
> + * If the remaining bulk allocation is large enough to consume
> + * an entire slab, avoid building the freelist only to drain it
> + * immediately. Instead, allocate a slab without a freelist and
> + * hand out all objects directly.
> */
> - refilled += alloc_from_new_slab(s, slab, p + refilled, max - refilled,
> - /* allow_spin = */ true);
> + if (bulk_refill_consumes_whole_slab(s, max - refilled)) {
> + bool allow_spin;
> +
> + slab = new_slab(s, gfp, local_node, false, &allow_spin);
> + if (!slab)
> + goto out;
> + stat(s, ALLOC_SLAB);
> + refilled += alloc_whole_from_new_slab(s, slab, p + refilled,
> + allow_spin);
> + } else {
> + slab = new_slab(s, gfp, local_node, true, NULL);
> + if (!slab)
> + goto out;
> + stat(s, ALLOC_SLAB);
> + refilled += alloc_from_new_slab(s, slab, p + refilled,
> + max - refilled,
> + /* allow_spin = */ true);
> + }
>
> if (refilled < min)
> goto new_slab;
> @@ -7587,7 +7704,7 @@ static void early_kmem_cache_node_alloc(int node)
>
> BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
>
> - slab = new_slab(kmem_cache_node, GFP_NOWAIT, node);
> + slab = new_slab(kmem_cache_node, GFP_NOWAIT, node, true, NULL);
>
> BUG_ON(!slab);
> if (slab_nid(slab) != node) {
> --
> 2.25.1
© 2016 - 2026 Red Hat, Inc.