From nobody Mon Jun  8 06:35:53 2026
Received: from out-171.mta0.migadu.com (out-171.mta0.migadu.com
 [91.218.175.171])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 41F063AA517
	for <linux-kernel@vger.kernel.org>; Mon,  1 Jun 2026 09:57:38 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=91.218.175.171
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1780307868; cv=none;
 b=Hp8CWXyqcridOkBjVqJ1PnLJVj8w1oA+kt/pLw02VLsaqWS5Bz5gCYI7qx6beChYGpnGawSim1UKgJnmlRyN9nz4b6cD5tY18ckelExvAIAOb5m3QirumGlKM/D4d+MpfkLSepNrATOVS+Nm046qVU5z5mggNMRZ87YVBjKTN+U=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1780307868; c=relaxed/simple;
	bh=TdZBGrwTwtVt2lYS2oPAFb2PNDZ9JfeQ1XMyez/Dc+0=;
	h=From:To:Cc:Subject:Date:Message-ID:MIME-Version;
 b=ekIuiJHCUUXWnAIya4xmVtAgBq9uymzddxYoT6D64PYjfCT2IQweTs60+7CZ7yFZU66yZ/7/hegbzAe8e5GAnfdJKCvu3HdUQby7I1IS4YUObwyif7wSXyH4iwribuQrgboNIfl01P15HQR7fxxtCM5Vq83FEB+2jDNg6w7JYPg=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=pass (p=none dis=none) header.from=linux.dev;
 spf=pass smtp.mailfrom=linux.dev;
 dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev
 header.b=V58lmzgU; arc=none smtp.client-ip=91.218.175.171
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=pass (p=none dis=none) header.from=linux.dev
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=linux.dev
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev
 header.b="V58lmzgU"
X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and
 include these headers.
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1;
	t=1780307857;
	h=from:from:reply-to:subject:subject:date:date:message-id:message-id:
	 to:to:cc:cc:mime-version:mime-version:
	 content-transfer-encoding:content-transfer-encoding;
	bh=TUs+rCabK0QoZm/U6SN1B2Cuc1KTvjDd/Qyt/e5rZGA=;
	b=V58lmzgUtTBFOyaGfXipiyAVr59RZsUVgNNYJp3bqPPUEfN38LqAfFHR1rVyJwTfT0BFt5
	EXAv48XYm/bcAE/JRs6DWVxFoJZ5tlLtq5kPu6taQih7tDCMfkUbZtBJztmhwMzbAWIgxF
	IClN6JgqNK0bguRSkvlcnY7ObYmb9vg=
From: Hao Li <hao.li@linux.dev>
To: vbabka@kernel.org,
	harry@kernel.org,
	akpm@linux-foundation.org
Cc: cl@gentwo.org,
	rientjes@google.com,
	roman.gushchin@linux.dev,
	linux-mm@kvack.org,
	linux-kernel@vger.kernel.org,
	Hao Li <hao.li@linux.dev>
Subject: [PATCH v2] mm/slub: allocate sheaves on local memory nodes
Date: Mon,  1 Jun 2026 17:56:21 +0800
Message-ID: <20260601095706.106551-1-hao.li@linux.dev>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
X-Migadu-Flow: FLOW_OUT
Content-Type: text/plain; charset="utf-8"

Sheaf structs are exchanged through node-local barns. Since barn structs
are already allocated from their local NUMA node, allocate sheaf structs
from their local memory nodes as well.

To achieve this, the obvious choice would be to use cpu_to_mem().
However, init_percpu_sheaves() and bootstrap_cache_sheaves() iterate
over all possible CPUs, whereas cpu_to_mem() is only initialized for
online CPUs. Therefore, cpu_to_mem() cannot be used there; use
local_memory_node(cpu_to_node(cpu)) instead, similar to what
__build_all_zonelists() does. The other callers, alloc_empty_sheaf()
and kmem_cache_prefill_sheaf(), pass numa_mem_id().

The primary goal of this patch is to improve NUMA node locality.
Although the actual performance impact is minor, it still yields a ~1%
improvement on a 192-core, 8-NUMA-node system when testing with the
will-it-scale mmap test case.

Signed-off-by: Hao Li <hao.li@linux.dev>
---
Changes in v2:
- Make init_percpu_sheaves() use a NUMA-aware sheaf struct allocation too.
  (Thanks Harry)
- Rebase on latest code.

v1: https://lore.kernel.org/linux-mm/20260525082312.16012-1-hao.li@linux.de=
v/

---
 mm/slub.c | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
index cbf6636a3dad..7d36e09ae216 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2757,7 +2757,7 @@ static inline void *setup_object(struct kmem_cache *s=
, void *object)
 }
=20
 static struct slab_sheaf *__alloc_empty_sheaf(struct kmem_cache *s, gfp_t =
gfp,
-					      unsigned int capacity)
+					      unsigned int capacity, int node)
 {
 	struct slab_sheaf *sheaf;
 	size_t sheaf_size;
@@ -2771,7 +2771,7 @@ static struct slab_sheaf *__alloc_empty_sheaf(struct =
kmem_cache *s, gfp_t gfp,
 		gfp |=3D __GFP_NO_OBJ_EXT;
=20
 	sheaf_size =3D struct_size(sheaf, objects, capacity);
-	sheaf =3D kzalloc(sheaf_size, gfp);
+	sheaf =3D kzalloc_node(sheaf_size, gfp, node);
=20
 	if (unlikely(!sheaf))
 		return NULL;
@@ -2791,7 +2791,7 @@ static inline struct slab_sheaf *alloc_empty_sheaf(st=
ruct kmem_cache *s,
=20
 	gfp &=3D ~OBJCGS_CLEAR_MASK;
=20
-	return __alloc_empty_sheaf(s, gfp, s->sheaf_capacity);
+	return __alloc_empty_sheaf(s, gfp, s->sheaf_capacity, numa_mem_id());
 }
=20
 static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *shea=
f)
@@ -5014,7 +5014,7 @@ kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t =
gfp, unsigned int size)
=20
 	if (unlikely(size > s->sheaf_capacity)) {
=20
-		sheaf =3D __alloc_empty_sheaf(s, gfp, size);
+		sheaf =3D __alloc_empty_sheaf(s, gfp, size, numa_mem_id());
 		if (!sheaf)
 			return NULL;
=20
@@ -7575,6 +7575,7 @@ static int init_percpu_sheaves(struct kmem_cache *s)
=20
 	for_each_possible_cpu(cpu) {
 		struct slub_percpu_sheaves *pcs;
+		int mem_node;
=20
 		pcs =3D per_cpu_ptr(s->cpu_sheaves, cpu);
=20
@@ -7598,10 +7599,13 @@ static int init_percpu_sheaves(struct kmem_cache *s)
 		 * For kmalloc caches it's used temporarily during the initial
 		 * bootstrap.
 		 */
-		if (!s->sheaf_capacity)
+		if (!s->sheaf_capacity) {
 			pcs->main =3D &bootstrap_sheaf;
-		else
-			pcs->main =3D alloc_empty_sheaf(s, GFP_KERNEL);
+		} else {
+			mem_node =3D local_memory_node(cpu_to_node(cpu));
+			pcs->main =3D __alloc_empty_sheaf(s, GFP_KERNEL,
+					s->sheaf_capacity, mem_node);
+		}
=20
 		if (!pcs->main)
 			return -ENOMEM;
@@ -8465,10 +8469,17 @@ static void __init bootstrap_cache_sheaves(struct k=
mem_cache *s)
=20
 	for_each_possible_cpu(cpu) {
 		struct slub_percpu_sheaves *pcs;
+		int mem_node;
=20
 		pcs =3D per_cpu_ptr(s->cpu_sheaves, cpu);
=20
-		pcs->main =3D __alloc_empty_sheaf(s, GFP_KERNEL, capacity);
+		/*
+		 * Cannot use cpu_to_mem() here because it's only initialized
+		 * for online CPUs at this point (see __build_all_zonelists),
+		 * while we need to allocate sheaves for all possible CPUs.
+		 */
+		mem_node =3D local_memory_node(cpu_to_node(cpu));
+		pcs->main =3D __alloc_empty_sheaf(s, GFP_KERNEL, capacity, mem_node);
=20
 		if (!pcs->main) {
 			failed =3D true;
--=20
2.54.0