From nobody Mon Feb  9 03:32:44 2026
Return-Path: <linux-kernel-owner@vger.kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id 74D15CDB465
	for <linux-kernel@archiver.kernel.org>; Thu, 12 Oct 2023 02:49:26 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S1376914AbjJLCtZ (ORCPT
        <rfc822;linux-kernel@archiver.kernel.org>);
        Wed, 11 Oct 2023 22:49:25 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:56930 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S235280AbjJLCsy (ORCPT
        <rfc822;linux-kernel@vger.kernel.org>);
        Wed, 11 Oct 2023 22:48:54 -0400
Received: from out30-97.freemail.mail.aliyun.com
 (out30-97.freemail.mail.aliyun.com [115.124.30.97])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 94750A4
        for <linux-kernel@vger.kernel.org>;
 Wed, 11 Oct 2023 19:48:51 -0700 (PDT)
X-Alimail-AntiSpam: 
 AC=PASS;BC=-1|-1;BR=01201311R111e4;CH=green;DM=||false|;DS=||;FP=0|-1|-1|-1|0|-1|-1|-1;HT=ay29a033018045168;MF=rongwei.wang@linux.alibaba.com;NM=1;PH=DS;RN=9;SR=0;TI=SMTPD_---0VtykMfs_1697078928;
Received: from localhost.localdomain(mailfrom:rongwei.wang@linux.alibaba.com
 fp:SMTPD_---0VtykMfs_1697078928)
          by smtp.aliyun-inc.com;
          Thu, 12 Oct 2023 10:48:48 +0800
From: Rongwei Wang <rongwei.wang@linux.alibaba.com>
To: linux-arm-kernel@lists.infradead.org, linux-kernel@vger.kernel.org,
        linux-mm@kvack.org
Cc: akpm@linux-foundation.org, willy@infradead.org,
        catalin.marinas@arm.com, dave.hansen@linux.intel.com,
        tj@kernel.org, mingo@redhat.com
Subject: [PATCH RFC 1/5] mm/numa: move numa emulation APIs into generic files
Date: Thu, 12 Oct 2023 10:48:38 +0800
Message-Id: <20231012024842.99703-2-rongwei.wang@linux.alibaba.com>
X-Mailer: git-send-email 2.40.0
In-Reply-To: <20231012024842.99703-1-rongwei.wang@linux.alibaba.com>
References: <20231012024842.99703-1-rongwei.wang@linux.alibaba.com>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Precedence: bulk
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
Content-Type: text/plain; charset="utf-8"

In order to support NUMA emulation (NUMA_EMU) on other
architectures, the functions that operate on numa_meminfo
need to be moved out of the x86 arch code. mm/numa.c is
created to hold these APIs.

CONFIG_NUMA_EMU itself is handled in a later patch of this series.

Signed-off-by: Rongwei Wang <rongwei.wang@linux.alibaba.com>
---
 arch/x86/include/asm/numa.h |   3 -
 arch/x86/mm/numa.c          | 216 +-------------------------
 arch/x86/mm/numa_internal.h |  14 +-
 include/asm-generic/numa.h  |  18 +++
 mm/Makefile                 |   1 +
 mm/numa.c                   | 298 ++++++++++++++++++++++++++++++++++++
 6 files changed, 323 insertions(+), 227 deletions(-)
 create mode 100644 mm/numa.c

diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index e3bae2b60a0d..8d79be8095d5 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -9,9 +9,6 @@
 #include <asm/apicdef.h>
=20
 #ifdef CONFIG_NUMA
-
-#define NR_NODE_MEMBLKS		(MAX_NUMNODES*2)
-
 /*
  * Too small node sizes may confuse the VM badly. Usually they
  * result from BIOS bugs. So dont recognize nodes as standalone
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 2aadb2019b4f..969b11fff03f 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -25,8 +25,8 @@ nodemask_t numa_nodes_parsed __initdata;
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
=20
-static struct numa_meminfo numa_meminfo __initdata_or_meminfo;
-static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo;
+extern struct numa_meminfo numa_meminfo;
+extern struct numa_meminfo numa_reserved_meminfo;
=20
 static int numa_distance_cnt;
 static u8 *numa_distance;
@@ -148,34 +148,6 @@ static int __init numa_add_memblk_to(int nid, u64 star=
t, u64 end,
 	return 0;
 }
=20
-/**
- * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
- * @idx: Index of memblk to remove
- * @mi: numa_meminfo to remove memblk from
- *
- * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
- * decrementing @mi->nr_blks.
- */
-void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
-{
-	mi->nr_blks--;
-	memmove(&mi->blk[idx], &mi->blk[idx + 1],
-		(mi->nr_blks - idx) * sizeof(mi->blk[0]));
-}
-
-/**
- * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to ano=
ther
- * @dst: numa_meminfo to append block to
- * @idx: Index of memblk to remove
- * @src: numa_meminfo to remove memblk from
- */
-static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx,
-					 struct numa_meminfo *src)
-{
-	dst->blk[dst->nr_blks++] =3D src->blk[idx];
-	numa_remove_memblk_from(idx, src);
-}
-
 /**
  * numa_add_memblk - Add one numa_memblk to numa_meminfo
  * @nid: NUMA node ID of the new memblk
@@ -225,124 +197,6 @@ static void __init alloc_node_data(int nid)
 	node_set_online(nid);
 }
=20
-/**
- * numa_cleanup_meminfo - Cleanup a numa_meminfo
- * @mi: numa_meminfo to clean up
- *
- * Sanitize @mi by merging and removing unnecessary memblks.  Also check f=
or
- * conflicts and clear unused memblks.
- *
- * RETURNS:
- * 0 on success, -errno on failure.
- */
-int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
-{
-	const u64 low =3D 0;
-	const u64 high =3D PFN_PHYS(max_pfn);
-	int i, j, k;
-
-	/* first, trim all entries */
-	for (i =3D 0; i < mi->nr_blks; i++) {
-		struct numa_memblk *bi =3D &mi->blk[i];
-
-		/* move / save reserved memory ranges */
-		if (!memblock_overlaps_region(&memblock.memory,
-					bi->start, bi->end - bi->start)) {
-			numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi);
-			continue;
-		}
-
-		/* make sure all non-reserved blocks are inside the limits */
-		bi->start =3D max(bi->start, low);
-
-		/* preserve info for non-RAM areas above 'max_pfn': */
-		if (bi->end > high) {
-			numa_add_memblk_to(bi->nid, high, bi->end,
-					   &numa_reserved_meminfo);
-			bi->end =3D high;
-		}
-
-		/* and there's no empty block */
-		if (bi->start >=3D bi->end)
-			numa_remove_memblk_from(i--, mi);
-	}
-
-	/* merge neighboring / overlapping entries */
-	for (i =3D 0; i < mi->nr_blks; i++) {
-		struct numa_memblk *bi =3D &mi->blk[i];
-
-		for (j =3D i + 1; j < mi->nr_blks; j++) {
-			struct numa_memblk *bj =3D &mi->blk[j];
-			u64 start, end;
-
-			/*
-			 * See whether there are overlapping blocks.  Whine
-			 * about but allow overlaps of the same nid.  They
-			 * will be merged below.
-			 */
-			if (bi->end > bj->start && bi->start < bj->end) {
-				if (bi->nid !=3D bj->nid) {
-					pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#01=
0Lx-%#010Lx]\n",
-					       bi->nid, bi->start, bi->end - 1,
-					       bj->nid, bj->start, bj->end - 1);
-					return -EINVAL;
-				}
-				pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [=
mem %#010Lx-%#010Lx]\n",
-					bi->nid, bi->start, bi->end - 1,
-					bj->start, bj->end - 1);
-			}
-
-			/*
-			 * Join together blocks on the same node, holes
-			 * between which don't overlap with memory on other
-			 * nodes.
-			 */
-			if (bi->nid !=3D bj->nid)
-				continue;
-			start =3D min(bi->start, bj->start);
-			end =3D max(bi->end, bj->end);
-			for (k =3D 0; k < mi->nr_blks; k++) {
-				struct numa_memblk *bk =3D &mi->blk[k];
-
-				if (bi->nid =3D=3D bk->nid)
-					continue;
-				if (start < bk->end && end > bk->start)
-					break;
-			}
-			if (k < mi->nr_blks)
-				continue;
-			printk(KERN_INFO "NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#=
010Lx] -> [mem %#010Lx-%#010Lx]\n",
-			       bi->nid, bi->start, bi->end - 1, bj->start,
-			       bj->end - 1, start, end - 1);
-			bi->start =3D start;
-			bi->end =3D end;
-			numa_remove_memblk_from(j--, mi);
-		}
-	}
-
-	/* clear unused ones */
-	for (i =3D mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
-		mi->blk[i].start =3D mi->blk[i].end =3D 0;
-		mi->blk[i].nid =3D NUMA_NO_NODE;
-	}
-
-	return 0;
-}
-
-/*
- * Set nodes, which have memory in @mi, in *@nodemask.
- */
-static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
-					      const struct numa_meminfo *mi)
-{
-	int i;
-
-	for (i =3D 0; i < ARRAY_SIZE(mi->blk); i++)
-		if (mi->blk[i].start !=3D mi->blk[i].end &&
-		    mi->blk[i].nid !=3D NUMA_NO_NODE)
-			node_set(mi->blk[i].nid, *nodemask);
-}
-
 /**
  * numa_reset_distance - Reset NUMA distance table
  *
@@ -478,72 +332,6 @@ static bool __init numa_meminfo_cover_memory(const str=
uct numa_meminfo *mi)
 	return true;
 }
=20
-/*
- * Mark all currently memblock-reserved physical memory (which covers the
- * kernel's own memory ranges) as hot-unswappable.
- */
-static void __init numa_clear_kernel_node_hotplug(void)
-{
-	nodemask_t reserved_nodemask =3D NODE_MASK_NONE;
-	struct memblock_region *mb_region;
-	int i;
-
-	/*
-	 * We have to do some preprocessing of memblock regions, to
-	 * make them suitable for reservation.
-	 *
-	 * At this time, all memory regions reserved by memblock are
-	 * used by the kernel, but those regions are not split up
-	 * along node boundaries yet, and don't necessarily have their
-	 * node ID set yet either.
-	 *
-	 * So iterate over all memory known to the x86 architecture,
-	 * and use those ranges to set the nid in memblock.reserved.
-	 * This will split up the memblock regions along node
-	 * boundaries and will set the node IDs as well.
-	 */
-	for (i =3D 0; i < numa_meminfo.nr_blks; i++) {
-		struct numa_memblk *mb =3D numa_meminfo.blk + i;
-		int ret;
-
-		ret =3D memblock_set_node(mb->start, mb->end - mb->start, &memblock.rese=
rved, mb->nid);
-		WARN_ON_ONCE(ret);
-	}
-
-	/*
-	 * Now go over all reserved memblock regions, to construct a
-	 * node mask of all kernel reserved memory areas.
-	 *
-	 * [ Note, when booting with mem=3Dnn[kMG] or in a kdump kernel,
-	 *   numa_meminfo might not include all memblock.reserved
-	 *   memory ranges, because quirks such as trim_snb_memory()
-	 *   reserve specific pages for Sandy Bridge graphics. ]
-	 */
-	for_each_reserved_mem_region(mb_region) {
-		int nid =3D memblock_get_region_node(mb_region);
-
-		if (nid !=3D MAX_NUMNODES)
-			node_set(nid, reserved_nodemask);
-	}
-
-	/*
-	 * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory
-	 * belonging to the reserved node mask.
-	 *
-	 * Note that this will include memory regions that reside
-	 * on nodes that contain kernel memory - entire nodes
-	 * become hot-unpluggable:
-	 */
-	for (i =3D 0; i < numa_meminfo.nr_blks; i++) {
-		struct numa_memblk *mb =3D numa_meminfo.blk + i;
-
-		if (!node_isset(mb->nid, reserved_nodemask))
-			continue;
-
-		memblock_clear_hotplug(mb->start, mb->end - mb->start);
-	}
-}
-
 static int __init numa_register_memblks(struct numa_meminfo *mi)
 {
 	int i, nid;
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
index 86860f279662..b6053beb81b1 100644
--- a/arch/x86/mm/numa_internal.h
+++ b/arch/x86/mm/numa_internal.h
@@ -16,19 +16,13 @@ struct numa_meminfo {
 	struct numa_memblk	blk[NR_NODE_MEMBLKS];
 };
=20
-void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi);
-int __init numa_cleanup_meminfo(struct numa_meminfo *mi);
+extern int __init numa_cleanup_meminfo(struct numa_meminfo *mi);
 void __init numa_reset_distance(void);
=20
 void __init x86_numa_init(void);
=20
-#ifdef CONFIG_NUMA_EMU
-void __init numa_emulation(struct numa_meminfo *numa_meminfo,
-			   int numa_dist_cnt);
-#else
-static inline void numa_emulation(struct numa_meminfo *numa_meminfo,
-				  int numa_dist_cnt)
-{ }
-#endif
+extern void __init numa_emulation(struct numa_meminfo *numa_meminfo,
+				  int numa_dist_cnt);
+
=20
 #endif	/* __X86_MM_NUMA_INTERNAL_H */
diff --git a/include/asm-generic/numa.h b/include/asm-generic/numa.h
index 1a3ad6d29833..929d7c582a73 100644
--- a/include/asm-generic/numa.h
+++ b/include/asm-generic/numa.h
@@ -39,6 +39,24 @@ void numa_store_cpu_info(unsigned int cpu);
 void numa_add_cpu(unsigned int cpu);
 void numa_remove_cpu(unsigned int cpu);
=20
+struct numa_memblk {
+	u64			start;
+	u64			end;
+	int			nid;
+};
+
+struct numa_meminfo {
+	int			nr_blks;
+	struct numa_memblk	blk[NR_NODE_MEMBLKS];
+};
+
+extern struct numa_meminfo numa_meminfo;
+
+int __init numa_register_memblks(struct numa_meminfo *mi);
+int __init numa_cleanup_meminfo(struct numa_meminfo *mi);
+void __init numa_emulation(struct numa_meminfo *numa_meminfo,
+			   int numa_dist_cnt);
+
 #else	/* CONFIG_NUMA */
=20
 static inline void numa_store_cpu_info(unsigned int cpu) { }
diff --git a/mm/Makefile b/mm/Makefile
index ec65984e2ade..6fc1bd7c9f5b 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -138,3 +138,4 @@ obj-$(CONFIG_IO_MAPPING) +=3D io-mapping.o
 obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) +=3D bootmem_info.o
 obj-$(CONFIG_GENERIC_IOREMAP) +=3D ioremap.o
 obj-$(CONFIG_SHRINKER_DEBUG) +=3D shrinker_debug.o
+obj-$(CONFIG_NUMA) +=3D numa.o
diff --git a/mm/numa.c b/mm/numa.c
new file mode 100644
index 000000000000..88277e8404f0
--- /dev/null
+++ b/mm/numa.c
@@ -0,0 +1,298 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/acpi.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/memblock.h>
+#include <linux/mmzone.h>
+#include <linux/ctype.h>
+#include <linux/nodemask.h>
+#include <linux/sched.h>
+#include <linux/topology.h>
+
+#include <asm/dma.h>
+
+struct numa_meminfo numa_meminfo __initdata_or_meminfo;
+struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo;
+
+/*
+ * Set nodes, which have memory in @mi, in *@nodemask.
+ */
+void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
+					      const struct numa_meminfo *mi)
+{
+	int i;
+
+	for (i =3D 0; i < ARRAY_SIZE(mi->blk); i++)
+		if (mi->blk[i].start !=3D mi->blk[i].end &&
+		    mi->blk[i].nid !=3D NUMA_NO_NODE)
+			node_set(mi->blk[i].nid, *nodemask);
+}
+
+/**
+ * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
+ * @idx: Index of memblk to remove
+ * @mi: numa_meminfo to remove memblk from
+ *
+ * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
+ * decrementing @mi->nr_blks.
+ */
+static void __init numa_remove_memblk_from(int idx, struct numa_meminfo *m=
i)
+{
+	mi->nr_blks--;
+	memmove(&mi->blk[idx], &mi->blk[idx + 1],
+		(mi->nr_blks - idx) * sizeof(mi->blk[0]));
+}
+
+/**
+ * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to ano=
ther
+ * @dst: numa_meminfo to append block to
+ * @idx: Index of memblk to remove
+ * @src: numa_meminfo to remove memblk from
+ */
+static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx,
+					 struct numa_meminfo *src)
+{
+	dst->blk[dst->nr_blks++] =3D src->blk[idx];
+	numa_remove_memblk_from(idx, src);
+}
+
+int __init numa_add_memblk_to(int nid, u64 start, u64 end,
+				     struct numa_meminfo *mi)
+{
+	/* ignore zero length blks */
+	if (start =3D=3D end)
+		return 0;
+
+	/* whine about and ignore invalid blks */
+	if (start > end || nid < 0 || nid >=3D MAX_NUMNODES) {
+		pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
+			nid, start, end - 1);
+		return 0;
+	}
+
+	if (mi->nr_blks >=3D NR_NODE_MEMBLKS) {
+		pr_err("too many memblk ranges\n");
+		return -EINVAL;
+	}
+
+	mi->blk[mi->nr_blks].start =3D start;
+	mi->blk[mi->nr_blks].end =3D end;
+	mi->blk[mi->nr_blks].nid =3D nid;
+	mi->nr_blks++;
+	return 0;
+}
+
+/**
+ * numa_cleanup_meminfo - Cleanup a numa_meminfo
+ * @mi: numa_meminfo to clean up
+ *
+ * Sanitize @mi by merging and removing unnecessary memblks.  Also check f=
or
+ * conflicts and clear unused memblks.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
+{
+	const u64 low =3D 0;
+	const u64 high =3D PFN_PHYS(max_pfn);
+	int i, j, k;
+
+	/* first, trim all entries */
+	for (i =3D 0; i < mi->nr_blks; i++) {
+		struct numa_memblk *bi =3D &mi->blk[i];
+
+		/* move / save reserved memory ranges */
+		if (!memblock_overlaps_region(&memblock.memory,
+					bi->start, bi->end - bi->start)) {
+			numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi);
+			continue;
+		}
+
+		/* make sure all non-reserved blocks are inside the limits */
+		bi->start =3D max(bi->start, low);
+
+		/* preserve info for non-RAM areas above 'max_pfn': */
+		if (bi->end > high) {
+			numa_add_memblk_to(bi->nid, high, bi->end,
+					   &numa_reserved_meminfo);
+			bi->end =3D high;
+		}
+
+		/* and there's no empty block */
+		if (bi->start >=3D bi->end)
+			numa_remove_memblk_from(i--, mi);
+	}
+
+	/* merge neighboring / overlapping entries */
+	for (i =3D 0; i < mi->nr_blks; i++) {
+		struct numa_memblk *bi =3D &mi->blk[i];
+
+		for (j =3D i + 1; j < mi->nr_blks; j++) {
+			struct numa_memblk *bj =3D &mi->blk[j];
+			u64 start, end;
+
+			/*
+			 * See whether there are overlapping blocks.  Whine
+			 * about but allow overlaps of the same nid.  They
+			 * will be merged below.
+			 */
+			if (bi->end > bj->start && bi->start < bj->end) {
+				if (bi->nid !=3D bj->nid) {
+					pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#01=
0Lx-%#010Lx]\n",
+					       bi->nid, bi->start, bi->end - 1,
+					       bj->nid, bj->start, bj->end - 1);
+					return -EINVAL;
+				}
+				pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [=
mem %#010Lx-%#010Lx]\n",
+					bi->nid, bi->start, bi->end - 1,
+					bj->start, bj->end - 1);
+			}
+
+			/*
+			 * Join together blocks on the same node, holes
+			 * between which don't overlap with memory on other
+			 * nodes.
+			 */
+			if (bi->nid !=3D bj->nid)
+				continue;
+			start =3D min(bi->start, bj->start);
+			end =3D max(bi->end, bj->end);
+			for (k =3D 0; k < mi->nr_blks; k++) {
+				struct numa_memblk *bk =3D &mi->blk[k];
+
+				if (bi->nid =3D=3D bk->nid)
+					continue;
+				if (start < bk->end && end > bk->start)
+					break;
+			}
+			if (k < mi->nr_blks)
+				continue;
+			printk(KERN_INFO "NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#=
010Lx] -> [mem %#010Lx-%#010Lx]\n",
+			       bi->nid, bi->start, bi->end - 1, bj->start,
+			       bj->end - 1, start, end - 1);
+			bi->start =3D start;
+			bi->end =3D end;
+			numa_remove_memblk_from(j--, mi);
+		}
+	}
+
+	/* clear unused ones */
+	for (i =3D mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
+		mi->blk[i].start =3D mi->blk[i].end =3D 0;
+		mi->blk[i].nid =3D NUMA_NO_NODE;
+	}
+
+	return 0;
+}
+
+/*
+ * Mark all currently memblock-reserved physical memory (which covers the
+ * kernel's own memory ranges) as hot-unswappable.
+ */
+static void __init numa_clear_kernel_node_hotplug(void)
+{
+	nodemask_t reserved_nodemask =3D NODE_MASK_NONE;
+	struct memblock_region *mb_region;
+	int i;
+
+	/*
+	 * We have to do some preprocessing of memblock regions, to
+	 * make them suitable for reservation.
+	 *
+	 * At this time, all memory regions reserved by memblock are
+	 * used by the kernel, but those regions are not split up
+	 * along node boundaries yet, and don't necessarily have their
+	 * node ID set yet either.
+	 *
+	 * So iterate over all memory known to the x86 architecture,
+	 * and use those ranges to set the nid in memblock.reserved.
+	 * This will split up the memblock regions along node
+	 * boundaries and will set the node IDs as well.
+	 */
+	for (i =3D 0; i < numa_meminfo.nr_blks; i++) {
+		struct numa_memblk *mb =3D numa_meminfo.blk + i;
+		int ret;
+
+		ret =3D memblock_set_node(mb->start, mb->end - mb->start, &memblock.rese=
rved, mb->nid);
+		WARN_ON_ONCE(ret);
+	}
+
+	/*
+	 * Now go over all reserved memblock regions, to construct a
+	 * node mask of all kernel reserved memory areas.
+	 *
+	 * [ Note, when booting with mem=3Dnn[kMG] or in a kdump kernel,
+	 *   numa_meminfo might not include all memblock.reserved
+	 *   memory ranges, because quirks such as trim_snb_memory()
+	 *   reserve specific pages for Sandy Bridge graphics. ]
+	 */
+	for_each_reserved_mem_region(mb_region) {
+		int nid =3D memblock_get_region_node(mb_region);
+
+		if (nid !=3D MAX_NUMNODES)
+			node_set(nid, reserved_nodemask);
+	}
+
+	/*
+	 * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory
+	 * belonging to the reserved node mask.
+	 *
+	 * Note that this will include memory regions that reside
+	 * on nodes that contain kernel memory - entire nodes
+	 * become hot-unpluggable:
+	 */
+	for (i =3D 0; i < numa_meminfo.nr_blks; i++) {
+		struct numa_memblk *mb =3D numa_meminfo.blk + i;
+
+		if (!node_isset(mb->nid, reserved_nodemask))
+			continue;
+
+		memblock_clear_hotplug(mb->start, mb->end - mb->start);
+	}
+}
+
+int __weak __init numa_register_memblks(struct numa_meminfo *mi)
+{
+	int i;
+
+	/* Account for nodes with cpus and no memory */
+	node_possible_map =3D numa_nodes_parsed;
+	numa_nodemask_from_meminfo(&node_possible_map, mi);
+	if (WARN_ON(nodes_empty(node_possible_map)))
+		return -EINVAL;
+
+	for (i =3D 0; i < mi->nr_blks; i++) {
+		struct numa_memblk *mb =3D &mi->blk[i];
+		memblock_set_node(mb->start, mb->end - mb->start,
+				  &memblock.memory, mb->nid);
+	}
+
+	/*
+	 * At very early time, the kernel have to use some memory such as
+	 * loading the kernel image. We cannot prevent this anyway. So any
+	 * node the kernel resides in should be un-hotpluggable.
+	 *
+	 * And when we come here, alloc node data won't fail.
+	 */
+	numa_clear_kernel_node_hotplug();
+
+	/*
+	 * If sections array is gonna be used for pfn -> nid mapping, check
+	 * whether its granularity is fine enough.
+	 */
+	if (IS_ENABLED(NODE_NOT_IN_PAGE_FLAGS)) {
+		unsigned long pfn_align =3D node_map_pfn_alignment();
+
+		if (pfn_align && pfn_align < PAGES_PER_SECTION) {
+			pr_warn("Node alignment %LuMB < min %LuMB, rejecting NUMA config\n",
+				PFN_PHYS(pfn_align) >> 20,
+				PFN_PHYS(PAGES_PER_SECTION) >> 20);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
--=20
2.32.0.3.gf3a3e56d6
From nobody Mon Feb  9 03:32:44 2026
Return-Path: <linux-kernel-owner@vger.kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id 16D4BCDB465
	for <linux-kernel@archiver.kernel.org>; Thu, 12 Oct 2023 02:49:24 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S235315AbjJLCtX (ORCPT <rfc822;linux-kernel@archiver.kernel.org>);
        Wed, 11 Oct 2023 22:49:23 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:56916 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S235276AbjJLCsy (ORCPT
        <rfc822;linux-kernel@vger.kernel.org>);
        Wed, 11 Oct 2023 22:48:54 -0400
Received: from out30-124.freemail.mail.aliyun.com
 (out30-124.freemail.mail.aliyun.com [115.124.30.124])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 96508A9
        for <linux-kernel@vger.kernel.org>;
 Wed, 11 Oct 2023 19:48:52 -0700 (PDT)
X-Alimail-AntiSpam: 
 AC=PASS;BC=-1|-1;BR=01201311R131e4;CH=green;DM=||false|;DS=||;FP=0|-1|-1|-1|0|-1|-1|-1;HT=ay29a033018045192;MF=rongwei.wang@linux.alibaba.com;NM=1;PH=DS;RN=9;SR=0;TI=SMTPD_---0VtykMgL_1697078929;
Received: from localhost.localdomain(mailfrom:rongwei.wang@linux.alibaba.com
 fp:SMTPD_---0VtykMgL_1697078929)
          by smtp.aliyun-inc.com;
          Thu, 12 Oct 2023 10:48:49 +0800
From: Rongwei Wang <rongwei.wang@linux.alibaba.com>
To: linux-arm-kernel@lists.infradead.org, linux-kernel@vger.kernel.org,
        linux-mm@kvack.org
Cc: akpm@linux-foundation.org, willy@infradead.org,
        catalin.marinas@arm.com, dave.hansen@linux.intel.com,
        tj@kernel.org, mingo@redhat.com
Subject: [PATCH RFC 2/5] mm: percpu: fix variable type of cpu
Date: Thu, 12 Oct 2023 10:48:39 +0800
Message-Id: <20231012024842.99703-3-rongwei.wang@linux.alibaba.com>
X-Mailer: git-send-email 2.40.0
In-Reply-To: <20231012024842.99703-1-rongwei.wang@linux.alibaba.com>
References: <20231012024842.99703-1-rongwei.wang@linux.alibaba.com>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Precedence: bulk
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
Content-Type: text/plain; charset="utf-8"

Almost all places declare 'cpu' as 'unsigned int', but
early_cpu_to_node() and the pcpu_fc_cpu_to_node_fn_t
typedef do not. Correct both in this patch.

Signed-off-by: Rongwei Wang <rongwei.wang@linux.alibaba.com>
---
 drivers/base/arch_numa.c | 2 +-
 include/linux/percpu.h   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c
index eaa31e567d1e..db0bb8b8fd67 100644
--- a/drivers/base/arch_numa.c
+++ b/drivers/base/arch_numa.c
@@ -144,7 +144,7 @@ void __init early_map_cpu_to_node(unsigned int cpu, int=
 nid)
 unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
 EXPORT_SYMBOL(__per_cpu_offset);
=20
-static int __init early_cpu_to_node(int cpu)
+static int __init early_cpu_to_node(unsigned int cpu)
 {
 	return cpu_to_node_map[cpu];
 }
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 68fac2e7cbe6..4aee8400af54 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -100,7 +100,7 @@ extern const char * const pcpu_fc_names[PCPU_FC_NR];
=20
 extern enum pcpu_fc pcpu_chosen_fc;
=20
-typedef int (pcpu_fc_cpu_to_node_fn_t)(int cpu);
+typedef int (pcpu_fc_cpu_to_node_fn_t)(unsigned int cpu);
 typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to=
);
=20
 extern struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
--=20
2.32.0.3.gf3a3e56d6
From nobody Mon Feb  9 03:32:44 2026
Return-Path: <linux-kernel-owner@vger.kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id B31FFCDB465
	for <linux-kernel@archiver.kernel.org>; Thu, 12 Oct 2023 02:49:29 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S1376938AbjJLCt2 (ORCPT
        <rfc822;linux-kernel@archiver.kernel.org>);
        Wed, 11 Oct 2023 22:49:28 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:56932 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S1376775AbjJLCsz (ORCPT
        <rfc822;linux-kernel@vger.kernel.org>);
        Wed, 11 Oct 2023 22:48:55 -0400
Received: from out30-113.freemail.mail.aliyun.com
 (out30-113.freemail.mail.aliyun.com [115.124.30.113])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 42F78B6
        for <linux-kernel@vger.kernel.org>;
 Wed, 11 Oct 2023 19:48:53 -0700 (PDT)
X-Alimail-AntiSpam: 
 AC=PASS;BC=-1|-1;BR=01201311R561e4;CH=green;DM=||false|;DS=||;FP=0|-1|-1|-1|0|-1|-1|-1;HT=ay29a033018046049;MF=rongwei.wang@linux.alibaba.com;NM=1;PH=DS;RN=9;SR=0;TI=SMTPD_---0VtykMgi_1697078930;
Received: from localhost.localdomain(mailfrom:rongwei.wang@linux.alibaba.com
 fp:SMTPD_---0VtykMgi_1697078930)
          by smtp.aliyun-inc.com;
          Thu, 12 Oct 2023 10:48:51 +0800
From: Rongwei Wang <rongwei.wang@linux.alibaba.com>
To: linux-arm-kernel@lists.infradead.org, linux-kernel@vger.kernel.org,
        linux-mm@kvack.org
Cc: akpm@linux-foundation.org, willy@infradead.org,
        catalin.marinas@arm.com, dave.hansen@linux.intel.com,
        tj@kernel.org, mingo@redhat.com
Subject: [PATCH RFC 3/5] arch_numa: remove __init in early_cpu_to_node()
Date: Thu, 12 Oct 2023 10:48:40 +0800
Message-Id: <20231012024842.99703-4-rongwei.wang@linux.alibaba.com>
X-Mailer: git-send-email 2.40.0
In-Reply-To: <20231012024842.99703-1-rongwei.wang@linux.alibaba.com>
References: <20231012024842.99703-1-rongwei.wang@linux.alibaba.com>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Precedence: bulk
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
Content-Type: text/plain; charset="utf-8"

Most architectures do not mark early_cpu_to_node()
as '__init', and it is safe to drop the attribute
here. The 'static' qualifier is dropped as well, in
preparation for the later NUMA emulation support.

Signed-off-by: Rongwei Wang <rongwei.wang@linux.alibaba.com>
---
 drivers/base/arch_numa.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c
index db0bb8b8fd67..5df0ad5cb09d 100644
--- a/drivers/base/arch_numa.c
+++ b/drivers/base/arch_numa.c
@@ -144,7 +144,7 @@ void __init early_map_cpu_to_node(unsigned int cpu, int=
 nid)
 unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
 EXPORT_SYMBOL(__per_cpu_offset);
=20
-static int __init early_cpu_to_node(unsigned int cpu)
+int early_cpu_to_node(unsigned int cpu)
 {
 	return cpu_to_node_map[cpu];
 }
--=20
2.32.0.3.gf3a3e56d6
From nobody Mon Feb  9 03:32:44 2026
Return-Path: <linux-kernel-owner@vger.kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id E00DECDB465
	for <linux-kernel@archiver.kernel.org>; Thu, 12 Oct 2023 02:49:32 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S1376964AbjJLCtb (ORCPT
        <rfc822;linux-kernel@archiver.kernel.org>);
        Wed, 11 Oct 2023 22:49:31 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:41070 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S1376803AbjJLCs5 (ORCPT
        <rfc822;linux-kernel@vger.kernel.org>);
        Wed, 11 Oct 2023 22:48:57 -0400
Received: from out30-113.freemail.mail.aliyun.com
 (out30-113.freemail.mail.aliyun.com [115.124.30.113])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 48BFCA9
        for <linux-kernel@vger.kernel.org>;
 Wed, 11 Oct 2023 19:48:54 -0700 (PDT)
X-Alimail-AntiSpam: 
 AC=PASS;BC=-1|-1;BR=01201311R111e4;CH=green;DM=||false|;DS=||;FP=0|-1|-1|-1|0|-1|-1|-1;HT=ay29a033018046060;MF=rongwei.wang@linux.alibaba.com;NM=1;PH=DS;RN=9;SR=0;TI=SMTPD_---0VtykMh0_1697078931;
Received: from localhost.localdomain(mailfrom:rongwei.wang@linux.alibaba.com
 fp:SMTPD_---0VtykMh0_1697078931)
          by smtp.aliyun-inc.com;
          Thu, 12 Oct 2023 10:48:52 +0800
From: Rongwei Wang <rongwei.wang@linux.alibaba.com>
To: linux-arm-kernel@lists.infradead.org, linux-kernel@vger.kernel.org,
        linux-mm@kvack.org
Cc: akpm@linux-foundation.org, willy@infradead.org,
        catalin.marinas@arm.com, dave.hansen@linux.intel.com,
        tj@kernel.org, mingo@redhat.com
Subject: [PATCH RFC 4/5] mm/numa: support CONFIG_NUMA_EMU for arm64
Date: Thu, 12 Oct 2023 10:48:41 +0800
Message-Id: <20231012024842.99703-5-rongwei.wang@linux.alibaba.com>
X-Mailer: git-send-email 2.40.0
In-Reply-To: <20231012024842.99703-1-rongwei.wang@linux.alibaba.com>
References: <20231012024842.99703-1-rongwei.wang@linux.alibaba.com>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Precedence: bulk
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
Content-Type: text/plain; charset="utf-8"

Move CONFIG_NUMA_EMU from arch/x86/Kconfig to
mm/Kconfig, so that both x86 and arm64 can support it.

Signed-off-by: Rongwei Wang <rongwei.wang@linux.alibaba.com>
---
 arch/x86/Kconfig             |   8 -
 arch/x86/mm/Makefile         |   1 -
 arch/x86/mm/numa_emulation.c | 585 -----------------------------------
 drivers/base/arch_numa.c     |   3 +
 include/asm-generic/numa.h   |  12 +
 mm/Kconfig                   |   8 +
 mm/numa.c                    |  12 +
 7 files changed, 35 insertions(+), 594 deletions(-)
 delete mode 100644 arch/x86/mm/numa_emulation.c

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 66bfabae8814..13438bfe2ec1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1568,14 +1568,6 @@ config X86_64_ACPI_NUMA
 	help
 	  Enable ACPI SRAT based node topology detection.
=20
-config NUMA_EMU
-	bool "NUMA emulation"
-	depends on NUMA
-	help
-	  Enable NUMA emulation. A flat machine will be split
-	  into virtual nodes when booted with "numa=3Dfake=3DN", where N is the
-	  number of nodes. This is only useful for debugging.
-
 config NODES_SHIFT
 	int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP
 	range 1 10
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index c80febc44cd2..1581f17e5de4 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -56,7 +56,6 @@ obj-$(CONFIG_MMIOTRACE_TEST)	+=3D testmmiotrace.o
 obj-$(CONFIG_NUMA)		+=3D numa.o numa_$(BITS).o
 obj-$(CONFIG_AMD_NUMA)		+=3D amdtopology.o
 obj-$(CONFIG_ACPI_NUMA)		+=3D srat.o
-obj-$(CONFIG_NUMA_EMU)		+=3D numa_emulation.o
=20
 obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS)	+=3D pkeys.o
 obj-$(CONFIG_RANDOMIZE_MEMORY)			+=3D kaslr.o
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
deleted file mode 100644
index 9a9305367fdd..000000000000
--- a/arch/x86/mm/numa_emulation.c
+++ /dev/null
@@ -1,585 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * NUMA emulation
- */
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/topology.h>
-#include <linux/memblock.h>
-#include <asm/dma.h>
-
-#include "numa_internal.h"
-
-static int emu_nid_to_phys[MAX_NUMNODES];
-static char *emu_cmdline __initdata;
-
-int __init numa_emu_cmdline(char *str)
-{
-	emu_cmdline =3D str;
-	return 0;
-}
-
-static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminf=
o *mi)
-{
-	int i;
-
-	for (i =3D 0; i < mi->nr_blks; i++)
-		if (mi->blk[i].nid =3D=3D nid)
-			return i;
-	return -ENOENT;
-}
-
-static u64 __init mem_hole_size(u64 start, u64 end)
-{
-	unsigned long start_pfn =3D PFN_UP(start);
-	unsigned long end_pfn =3D PFN_DOWN(end);
-
-	if (start_pfn < end_pfn)
-		return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn));
-	return 0;
-}
-
-/*
- * Sets up nid to range from @start to @end.  The return value is -errno if
- * something went wrong, 0 otherwise.
- */
-static int __init emu_setup_memblk(struct numa_meminfo *ei,
-				   struct numa_meminfo *pi,
-				   int nid, int phys_blk, u64 size)
-{
-	struct numa_memblk *eb =3D &ei->blk[ei->nr_blks];
-	struct numa_memblk *pb =3D &pi->blk[phys_blk];
-
-	if (ei->nr_blks >=3D NR_NODE_MEMBLKS) {
-		pr_err("NUMA: Too many emulated memblks, failing emulation\n");
-		return -EINVAL;
-	}
-
-	ei->nr_blks++;
-	eb->start =3D pb->start;
-	eb->end =3D pb->start + size;
-	eb->nid =3D nid;
-
-	if (emu_nid_to_phys[nid] =3D=3D NUMA_NO_NODE)
-		emu_nid_to_phys[nid] =3D pb->nid;
-
-	pb->start +=3D size;
-	if (pb->start >=3D pb->end) {
-		WARN_ON_ONCE(pb->start > pb->end);
-		numa_remove_memblk_from(phys_blk, pi);
-	}
-
-	printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n",
-	       nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20);
-	return 0;
-}
-
-/*
- * Sets up nr_nodes fake nodes interleaved over physical nodes ranging fro=
m addr
- * to max_addr.
- *
- * Returns zero on success or negative on error.
- */
-static int __init split_nodes_interleave(struct numa_meminfo *ei,
-					 struct numa_meminfo *pi,
-					 u64 addr, u64 max_addr, int nr_nodes)
-{
-	nodemask_t physnode_mask =3D numa_nodes_parsed;
-	u64 size;
-	int big;
-	int nid =3D 0;
-	int i, ret;
-
-	if (nr_nodes <=3D 0)
-		return -1;
-	if (nr_nodes > MAX_NUMNODES) {
-		pr_info("numa=3Dfake=3D%d too large, reducing to %d\n",
-			nr_nodes, MAX_NUMNODES);
-		nr_nodes =3D MAX_NUMNODES;
-	}
-
-	/*
-	 * Calculate target node size.  x86_32 freaks on __udivdi3() so do
-	 * the division in ulong number of pages and convert back.
-	 */
-	size =3D max_addr - addr - mem_hole_size(addr, max_addr);
-	size =3D PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);
-
-	/*
-	 * Calculate the number of big nodes that can be allocated as a result
-	 * of consolidating the remainder.
-	 */
-	big =3D ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
-		FAKE_NODE_MIN_SIZE;
-
-	size &=3D FAKE_NODE_MIN_HASH_MASK;
-	if (!size) {
-		pr_err("Not enough memory for each node.  "
-			"NUMA emulation disabled.\n");
-		return -1;
-	}
-
-	/*
-	 * Continue to fill physical nodes with fake nodes until there is no
-	 * memory left on any of them.
-	 */
-	while (!nodes_empty(physnode_mask)) {
-		for_each_node_mask(i, physnode_mask) {
-			u64 dma32_end =3D PFN_PHYS(MAX_DMA32_PFN);
-			u64 start, limit, end;
-			int phys_blk;
-
-			phys_blk =3D emu_find_memblk_by_nid(i, pi);
-			if (phys_blk < 0) {
-				node_clear(i, physnode_mask);
-				continue;
-			}
-			start =3D pi->blk[phys_blk].start;
-			limit =3D pi->blk[phys_blk].end;
-			end =3D start + size;
-
-			if (nid < big)
-				end +=3D FAKE_NODE_MIN_SIZE;
-
-			/*
-			 * Continue to add memory to this fake node if its
-			 * non-reserved memory is less than the per-node size.
-			 */
-			while (end - start - mem_hole_size(start, end) < size) {
-				end +=3D FAKE_NODE_MIN_SIZE;
-				if (end > limit) {
-					end =3D limit;
-					break;
-				}
-			}
-
-			/*
-			 * If there won't be at least FAKE_NODE_MIN_SIZE of
-			 * non-reserved memory in ZONE_DMA32 for the next node,
-			 * this one must extend to the boundary.
-			 */
-			if (end < dma32_end && dma32_end - end -
-			    mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
-				end =3D dma32_end;
-
-			/*
-			 * If there won't be enough non-reserved memory for the
-			 * next node, this one must extend to the end of the
-			 * physical node.
-			 */
-			if (limit - end - mem_hole_size(end, limit) < size)
-				end =3D limit;
-
-			ret =3D emu_setup_memblk(ei, pi, nid++ % nr_nodes,
-					       phys_blk,
-					       min(end, limit) - start);
-			if (ret < 0)
-				return ret;
-		}
-	}
-	return 0;
-}
-
-/*
- * Returns the end address of a node so that there is at least `size' amou=
nt of
- * non-reserved memory or `max_addr' is reached.
- */
-static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
-{
-	u64 end =3D start + size;
-
-	while (end - start - mem_hole_size(start, end) < size) {
-		end +=3D FAKE_NODE_MIN_SIZE;
-		if (end > max_addr) {
-			end =3D max_addr;
-			break;
-		}
-	}
-	return end;
-}
-
-static u64 uniform_size(u64 max_addr, u64 base, u64 hole, int nr_nodes)
-{
-	unsigned long max_pfn =3D PHYS_PFN(max_addr);
-	unsigned long base_pfn =3D PHYS_PFN(base);
-	unsigned long hole_pfns =3D PHYS_PFN(hole);
-
-	return PFN_PHYS((max_pfn - base_pfn - hole_pfns) / nr_nodes);
-}
-
-/*
- * Sets up fake nodes of `size' interleaved over physical nodes ranging fr=
om
- * `addr' to `max_addr'.
- *
- * Returns zero on success or negative on error.
- */
-static int __init split_nodes_size_interleave_uniform(struct numa_meminfo =
*ei,
-					      struct numa_meminfo *pi,
-					      u64 addr, u64 max_addr, u64 size,
-					      int nr_nodes, struct numa_memblk *pblk,
-					      int nid)
-{
-	nodemask_t physnode_mask =3D numa_nodes_parsed;
-	int i, ret, uniform =3D 0;
-	u64 min_size;
-
-	if ((!size && !nr_nodes) || (nr_nodes && !pblk))
-		return -1;
-
-	/*
-	 * In the 'uniform' case split the passed in physical node by
-	 * nr_nodes, in the non-uniform case, ignore the passed in
-	 * physical block and try to create nodes of at least size
-	 * @size.
-	 *
-	 * In the uniform case, split the nodes strictly by physical
-	 * capacity, i.e. ignore holes. In the non-uniform case account
-	 * for holes and treat @size as a minimum floor.
-	 */
-	if (!nr_nodes)
-		nr_nodes =3D MAX_NUMNODES;
-	else {
-		nodes_clear(physnode_mask);
-		node_set(pblk->nid, physnode_mask);
-		uniform =3D 1;
-	}
-
-	if (uniform) {
-		min_size =3D uniform_size(max_addr, addr, 0, nr_nodes);
-		size =3D min_size;
-	} else {
-		/*
-		 * The limit on emulated nodes is MAX_NUMNODES, so the
-		 * size per node is increased accordingly if the
-		 * requested size is too small.  This creates a uniform
-		 * distribution of node sizes across the entire machine
-		 * (but not necessarily over physical nodes).
-		 */
-		min_size =3D uniform_size(max_addr, addr,
-				mem_hole_size(addr, max_addr), nr_nodes);
-	}
-	min_size =3D ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE);
-	if (size < min_size) {
-		pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
-			size >> 20, min_size >> 20);
-		size =3D min_size;
-	}
-	size =3D ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE);
-
-	/*
-	 * Fill physical nodes with fake nodes of size until there is no memory
-	 * left on any of them.
-	 */
-	while (!nodes_empty(physnode_mask)) {
-		for_each_node_mask(i, physnode_mask) {
-			u64 dma32_end =3D PFN_PHYS(MAX_DMA32_PFN);
-			u64 start, limit, end;
-			int phys_blk;
-
-			phys_blk =3D emu_find_memblk_by_nid(i, pi);
-			if (phys_blk < 0) {
-				node_clear(i, physnode_mask);
-				continue;
-			}
-
-			start =3D pi->blk[phys_blk].start;
-			limit =3D pi->blk[phys_blk].end;
-
-			if (uniform)
-				end =3D start + size;
-			else
-				end =3D find_end_of_node(start, limit, size);
-			/*
-			 * If there won't be at least FAKE_NODE_MIN_SIZE of
-			 * non-reserved memory in ZONE_DMA32 for the next node,
-			 * this one must extend to the boundary.
-			 */
-			if (end < dma32_end && dma32_end - end -
-			    mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
-				end =3D dma32_end;
-
-			/*
-			 * If there won't be enough non-reserved memory for the
-			 * next node, this one must extend to the end of the
-			 * physical node.
-			 */
-			if ((limit - end - mem_hole_size(end, limit) < size)
-					&& !uniform)
-				end =3D limit;
-
-			ret =3D emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
-					       phys_blk,
-					       min(end, limit) - start);
-			if (ret < 0)
-				return ret;
-		}
-	}
-	return nid;
-}
-
-static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
-					      struct numa_meminfo *pi,
-					      u64 addr, u64 max_addr, u64 size)
-{
-	return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size,
-			0, NULL, 0);
-}
-
-static int __init setup_emu2phys_nid(int *dfl_phys_nid)
-{
-	int i, max_emu_nid =3D 0;
-
-	*dfl_phys_nid =3D NUMA_NO_NODE;
-	for (i =3D 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
-		if (emu_nid_to_phys[i] !=3D NUMA_NO_NODE) {
-			max_emu_nid =3D i;
-			if (*dfl_phys_nid =3D=3D NUMA_NO_NODE)
-				*dfl_phys_nid =3D emu_nid_to_phys[i];
-		}
-	}
-
-	return max_emu_nid;
-}
-
-/**
- * numa_emulation - Emulate NUMA nodes
- * @numa_meminfo: NUMA configuration to massage
- * @numa_dist_cnt: The size of the physical NUMA distance table
- *
- * Emulate NUMA nodes according to the numa=3Dfake kernel parameter.
- * @numa_meminfo contains the physical memory configuration and is modified
- * to reflect the emulated configuration on success.  @numa_dist_cnt is
- * used to determine the size of the physical distance table.
- *
- * On success, the following modifications are made.
- *
- * - @numa_meminfo is updated to reflect the emulated nodes.
- *
- * - __apicid_to_node[] is updated such that APIC IDs are mapped to the
- *   emulated nodes.
- *
- * - NUMA distance table is rebuilt to represent distances between emulated
- *   nodes.  The distances are determined considering how emulated nodes
- *   are mapped to physical nodes and match the actual distances.
- *
- * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical
- *   nodes.  This is used by numa_add_cpu() and numa_remove_cpu().
- *
- * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with
- * identity mapping and no other modification is made.
- */
-void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dis=
t_cnt)
-{
-	static struct numa_meminfo ei __initdata;
-	static struct numa_meminfo pi __initdata;
-	const u64 max_addr =3D PFN_PHYS(max_pfn);
-	u8 *phys_dist =3D NULL;
-	size_t phys_size =3D numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
-	int max_emu_nid, dfl_phys_nid;
-	int i, j, ret;
-
-	if (!emu_cmdline)
-		goto no_emu;
-
-	memset(&ei, 0, sizeof(ei));
-	pi =3D *numa_meminfo;
-
-	for (i =3D 0; i < MAX_NUMNODES; i++)
-		emu_nid_to_phys[i] =3D NUMA_NO_NODE;
-
-	/*
-	 * If the numa=3Dfake command-line contains a 'M' or 'G', it represents
-	 * the fixed node size.  Otherwise, if it is just a single number N,
-	 * split the system RAM into N fake nodes.
-	 */
-	if (strchr(emu_cmdline, 'U')) {
-		nodemask_t physnode_mask =3D numa_nodes_parsed;
-		unsigned long n;
-		int nid =3D 0;
-
-		n =3D simple_strtoul(emu_cmdline, &emu_cmdline, 0);
-		ret =3D -1;
-		for_each_node_mask(i, physnode_mask) {
-			/*
-			 * The reason we pass in blk[0] is due to
-			 * numa_remove_memblk_from() called by
-			 * emu_setup_memblk() will delete entry 0
-			 * and then move everything else up in the pi.blk
-			 * array. Therefore we should always be looking
-			 * at blk[0].
-			 */
-			ret =3D split_nodes_size_interleave_uniform(&ei, &pi,
-					pi.blk[0].start, pi.blk[0].end, 0,
-					n, &pi.blk[0], nid);
-			if (ret < 0)
-				break;
-			if (ret < n) {
-				pr_info("%s: phys: %d only got %d of %ld nodes, failing\n",
-						__func__, i, ret, n);
-				ret =3D -1;
-				break;
-			}
-			nid =3D ret;
-		}
-	} else if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
-		u64 size;
-
-		size =3D memparse(emu_cmdline, &emu_cmdline);
-		ret =3D split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
-	} else {
-		unsigned long n;
-
-		n =3D simple_strtoul(emu_cmdline, &emu_cmdline, 0);
-		ret =3D split_nodes_interleave(&ei, &pi, 0, max_addr, n);
-	}
-	if (*emu_cmdline =3D=3D ':')
-		emu_cmdline++;
-
-	if (ret < 0)
-		goto no_emu;
-
-	if (numa_cleanup_meminfo(&ei) < 0) {
-		pr_warn("NUMA: Warning: constructed meminfo invalid, disabling emulation=
\n");
-		goto no_emu;
-	}
-
-	/* copy the physical distance table */
-	if (numa_dist_cnt) {
-		u64 phys;
-
-		phys =3D memblock_phys_alloc_range(phys_size, PAGE_SIZE, 0,
-						 PFN_PHYS(max_pfn_mapped));
-		if (!phys) {
-			pr_warn("NUMA: Warning: can't allocate copy of distance table, disablin=
g emulation\n");
-			goto no_emu;
-		}
-		phys_dist =3D __va(phys);
-
-		for (i =3D 0; i < numa_dist_cnt; i++)
-			for (j =3D 0; j < numa_dist_cnt; j++)
-				phys_dist[i * numa_dist_cnt + j] =3D
-					node_distance(i, j);
-	}
-
-	/*
-	 * Determine the max emulated nid and the default phys nid to use
-	 * for unmapped nodes.
-	 */
-	max_emu_nid =3D setup_emu2phys_nid(&dfl_phys_nid);
-
-	/* commit */
-	*numa_meminfo =3D ei;
-
-	/* Make sure numa_nodes_parsed only contains emulated nodes */
-	nodes_clear(numa_nodes_parsed);
-	for (i =3D 0; i < ARRAY_SIZE(ei.blk); i++)
-		if (ei.blk[i].start !=3D ei.blk[i].end &&
-		    ei.blk[i].nid !=3D NUMA_NO_NODE)
-			node_set(ei.blk[i].nid, numa_nodes_parsed);
-
-	/*
-	 * Transform __apicid_to_node table to use emulated nids by
-	 * reverse-mapping phys_nid.  The maps should always exist but fall
-	 * back to zero just in case.
-	 */
-	for (i =3D 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
-		if (__apicid_to_node[i] =3D=3D NUMA_NO_NODE)
-			continue;
-		for (j =3D 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
-			if (__apicid_to_node[i] =3D=3D emu_nid_to_phys[j])
-				break;
-		__apicid_to_node[i] =3D j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
-	}
-
-	/* make sure all emulated nodes are mapped to a physical node */
-	for (i =3D 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
-		if (emu_nid_to_phys[i] =3D=3D NUMA_NO_NODE)
-			emu_nid_to_phys[i] =3D dfl_phys_nid;
-
-	/* transform distance table */
-	numa_reset_distance();
-	for (i =3D 0; i < max_emu_nid + 1; i++) {
-		for (j =3D 0; j < max_emu_nid + 1; j++) {
-			int physi =3D emu_nid_to_phys[i];
-			int physj =3D emu_nid_to_phys[j];
-			int dist;
-
-			if (get_option(&emu_cmdline, &dist) =3D=3D 2)
-				;
-			else if (physi >=3D numa_dist_cnt || physj >=3D numa_dist_cnt)
-				dist =3D physi =3D=3D physj ?
-					LOCAL_DISTANCE : REMOTE_DISTANCE;
-			else
-				dist =3D phys_dist[physi * numa_dist_cnt + physj];
-
-			numa_set_distance(i, j, dist);
-		}
-	}
-
-	/* free the copied physical distance table */
-	memblock_free(phys_dist, phys_size);
-	return;
-
-no_emu:
-	/* No emulation.  Build identity emu_nid_to_phys[] for numa_add_cpu() */
-	for (i =3D 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
-		emu_nid_to_phys[i] =3D i;
-}
-
-#ifndef CONFIG_DEBUG_PER_CPU_MAPS
-void numa_add_cpu(int cpu)
-{
-	int physnid, nid;
-
-	nid =3D early_cpu_to_node(cpu);
-	BUG_ON(nid =3D=3D NUMA_NO_NODE || !node_online(nid));
-
-	physnid =3D emu_nid_to_phys[nid];
-
-	/*
-	 * Map the cpu to each emulated node that is allocated on the physical
-	 * node of the cpu's apic id.
-	 */
-	for_each_online_node(nid)
-		if (emu_nid_to_phys[nid] =3D=3D physnid)
-			cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
-}
-
-void numa_remove_cpu(int cpu)
-{
-	int i;
-
-	for_each_online_node(i)
-		cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
-}
-#else	/* !CONFIG_DEBUG_PER_CPU_MAPS */
-static void numa_set_cpumask(int cpu, bool enable)
-{
-	int nid, physnid;
-
-	nid =3D early_cpu_to_node(cpu);
-	if (nid =3D=3D NUMA_NO_NODE) {
-		/* early_cpu_to_node() already emits a warning and trace */
-		return;
-	}
-
-	physnid =3D emu_nid_to_phys[nid];
-
-	for_each_online_node(nid) {
-		if (emu_nid_to_phys[nid] !=3D physnid)
-			continue;
-
-		debug_cpumask_set_cpu(cpu, nid, enable);
-	}
-}
-
-void numa_add_cpu(int cpu)
-{
-	numa_set_cpumask(cpu, true);
-}
-
-void numa_remove_cpu(int cpu)
-{
-	numa_set_cpumask(cpu, false);
-}
-#endif	/* !CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c
index 5df0ad5cb09d..67bdbcd0caf9 100644
--- a/drivers/base/arch_numa.c
+++ b/drivers/base/arch_numa.c
@@ -13,6 +13,7 @@
 #include <linux/module.h>
 #include <linux/of.h>
=20
+#include <asm-generic/numa.h>
 #include <asm/sections.h>
=20
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
@@ -30,6 +31,8 @@ static __init int numa_parse_early_param(char *opt)
 		return -EINVAL;
 	if (str_has_prefix(opt, "off"))
 		numa_off =3D true;
+	if (!strncmp(opt, "fake=3D", 5))
+		return numa_emu_cmdline(opt + 5);
=20
 	return 0;
 }
diff --git a/include/asm-generic/numa.h b/include/asm-generic/numa.h
index 929d7c582a73..4658155a070a 100644
--- a/include/asm-generic/numa.h
+++ b/include/asm-generic/numa.h
@@ -50,12 +50,24 @@ struct numa_meminfo {
 	struct numa_memblk	blk[NR_NODE_MEMBLKS];
 };
=20
+#ifdef CONFIG_NUMA_EMU
+#define FAKE_NODE_MIN_SIZE	((u64)32 << 20)
+#define FAKE_NODE_MIN_HASH_MASK	(~(FAKE_NODE_MIN_SIZE - 1UL))
+
 extern struct numa_meminfo numa_meminfo;
+extern char *emu_cmdline __initdata;
=20
+int numa_emu_cmdline(char *str);
 int __init numa_register_memblks(struct numa_meminfo *mi);
 int __init numa_cleanup_meminfo(struct numa_meminfo *mi);
 void __init numa_emulation(struct numa_meminfo *numa_meminfo,
 			   int numa_dist_cnt);
+#else
+static inline int numa_emu_cmdline(char *str)
+{
+	return -EINVAL;
+}
+#endif
=20
 #else	/* CONFIG_NUMA */
=20
diff --git a/mm/Kconfig b/mm/Kconfig
index 264a2df5ecf5..22bead675ee6 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -549,6 +549,14 @@ config ARCH_ENABLE_MEMORY_HOTPLUG
 config ARCH_ENABLE_MEMORY_HOTREMOVE
 	bool
=20
+config NUMA_EMU
+	bool "NUMA emulation (EXPERIMENTAL)"
+	depends on NUMA && (X86 || ARM64)
+	help
+	  Enable NUMA emulation. A flat machine will be split
+	  into virtual nodes when booted with "numa=3Dfake=3DN", where N is the
+	  number of nodes. This is only useful for debugging.
+
 # eventually, we can have this option just 'select SPARSEMEM'
 menuconfig MEMORY_HOTPLUG
 	bool "Memory hotplug"
diff --git a/mm/numa.c b/mm/numa.c
index 88277e8404f0..3cc01f06a2a6 100644
--- a/mm/numa.c
+++ b/mm/numa.c
@@ -16,6 +16,10 @@
 struct numa_meminfo numa_meminfo __initdata_or_meminfo;
 struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo;
=20
+#ifdef CONFIG_NUMA_EMU
+char *emu_cmdline __initdata;
+#endif
+
 /*
  * Set nodes, which have memory in @mi, in *@nodemask.
  */
@@ -296,3 +300,11 @@ int __weak __init numa_register_memblks(struct numa_me=
minfo *mi)
=20
 	return 0;
 }
+
+#ifdef CONFIG_NUMA_EMU
+int __init numa_emu_cmdline(char *str)
+{
+	emu_cmdline =3D str;
+	return 0;
+}
+#endif
--=20
2.32.0.3.gf3a3e56d6
From nobody Mon Feb  9 03:32:44 2026
Return-Path: <linux-kernel-owner@vger.kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id F3CCBCDB46E
	for <linux-kernel@archiver.kernel.org>; Thu, 12 Oct 2023 02:49:36 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S1347057AbjJLCte (ORCPT
        <rfc822;linux-kernel@archiver.kernel.org>);
        Wed, 11 Oct 2023 22:49:34 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:41086 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S1376859AbjJLCs6 (ORCPT
        <rfc822;linux-kernel@vger.kernel.org>);
        Wed, 11 Oct 2023 22:48:58 -0400
Received: from out30-100.freemail.mail.aliyun.com
 (out30-100.freemail.mail.aliyun.com [115.124.30.100])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id D30D9A4
        for <linux-kernel@vger.kernel.org>;
 Wed, 11 Oct 2023 19:48:55 -0700 (PDT)
X-Alimail-AntiSpam: 
 AC=PASS;BC=-1|-1;BR=01201311R171e4;CH=green;DM=||false|;DS=||;FP=0|-1|-1|-1|0|-1|-1|-1;HT=ay29a033018046059;MF=rongwei.wang@linux.alibaba.com;NM=1;PH=DS;RN=9;SR=0;TI=SMTPD_---0VtykMhN_1697078932;
Received: from localhost.localdomain(mailfrom:rongwei.wang@linux.alibaba.com
 fp:SMTPD_---0VtykMhN_1697078932)
          by smtp.aliyun-inc.com;
          Thu, 12 Oct 2023 10:48:53 +0800
From: Rongwei Wang <rongwei.wang@linux.alibaba.com>
To: linux-arm-kernel@lists.infradead.org, linux-kernel@vger.kernel.org,
        linux-mm@kvack.org
Cc: akpm@linux-foundation.org, willy@infradead.org,
        catalin.marinas@arm.com, dave.hansen@linux.intel.com,
        tj@kernel.org, mingo@redhat.com
Subject: [PATCH RFC 5/5] mm/numa: migrate leftover numa emulation into
 mm/numa.c
Date: Thu, 12 Oct 2023 10:48:42 +0800
Message-Id: <20231012024842.99703-6-rongwei.wang@linux.alibaba.com>
X-Mailer: git-send-email 2.40.0
In-Reply-To: <20231012024842.99703-1-rongwei.wang@linux.alibaba.com>
References: <20231012024842.99703-1-rongwei.wang@linux.alibaba.com>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Precedence: bulk
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
Content-Type: text/plain; charset="utf-8"

Move the code from the original arch/x86/mm/numa_emulation.c into
mm/numa.c so that it can be enabled for arm64.

Signed-off-by: Rongwei Wang <rongwei.wang@linux.alibaba.com>
---
 drivers/base/arch_numa.c   |   2 +
 include/asm-generic/numa.h |   3 +
 mm/numa.c                  | 586 ++++++++++++++++++++++++++++++++++++-
 3 files changed, 587 insertions(+), 4 deletions(-)

diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c
index 67bdbcd0caf9..c6f5ceadb9e1 100644
--- a/drivers/base/arch_numa.c
+++ b/drivers/base/arch_numa.c
@@ -64,6 +64,7 @@ EXPORT_SYMBOL(cpumask_of_node);
=20
 #endif
=20
+#ifndef CONFIG_NUMA_EMU
 static void numa_update_cpu(unsigned int cpu, bool remove)
 {
 	int nid =3D cpu_to_node(cpu);
@@ -92,6 +93,7 @@ void numa_clear_node(unsigned int cpu)
 	numa_remove_cpu(cpu);
 	set_cpu_numa_node(cpu, NUMA_NO_NODE);
 }
+#endif
=20
 /*
  * Allocate node_to_cpumask_map based on number of available nodes
diff --git a/include/asm-generic/numa.h b/include/asm-generic/numa.h
index 4658155a070a..9969ec7f59a4 100644
--- a/include/asm-generic/numa.h
+++ b/include/asm-generic/numa.h
@@ -55,6 +55,7 @@ struct numa_meminfo {
 #define FAKE_NODE_MIN_HASH_MASK	(~(FAKE_NODE_MIN_SIZE - 1UL))
=20
 extern struct numa_meminfo numa_meminfo;
+extern int emu_nid_to_phys[MAX_NUMNODES];
 extern char *emu_cmdline __initdata;
=20
 int numa_emu_cmdline(char *str);
@@ -62,6 +63,8 @@ int __init numa_register_memblks(struct numa_meminfo *mi);
 int __init numa_cleanup_meminfo(struct numa_meminfo *mi);
 void __init numa_emulation(struct numa_meminfo *numa_meminfo,
 			   int numa_dist_cnt);
+int __init numa_add_memblk_to(int nid, u64 start, u64 end,
+			      struct numa_meminfo *mi);
 #else
 static inline int numa_emu_cmdline(char *str)
 {
diff --git a/mm/numa.c b/mm/numa.c
index 3cc01f06a2a6..a6e9652498c9 100644
--- a/mm/numa.c
+++ b/mm/numa.c
@@ -1,4 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
+/* Most of this file comes from x86/numa_emulation.c */
 #include <linux/acpi.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
@@ -16,10 +17,6 @@
 struct numa_meminfo numa_meminfo __initdata_or_meminfo;
 struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo;
=20
-#ifdef CONFIG_NUMA_EMU
-char *emu_cmdline __initdata;
-#endif
-
 /*
  * Set nodes, which have memory in @mi, in *@nodemask.
  */
@@ -302,9 +299,590 @@ int __weak __init numa_register_memblks(struct numa_m=
eminfo *mi)
 }
=20
 #ifdef CONFIG_NUMA_EMU
+int emu_nid_to_phys[MAX_NUMNODES];
+char *emu_cmdline __initdata;
+
 int __init numa_emu_cmdline(char *str)
 {
 	emu_cmdline =3D str;
 	return 0;
 }
+
+static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminf=
o *mi)
+{
+	int i;
+
+	for (i =3D 0; i < mi->nr_blks; i++)
+		if (mi->blk[i].nid =3D=3D nid)
+			return i;
+	return -ENOENT;
+}
+
+static u64 __init mem_hole_size(u64 start, u64 end)
+{
+	unsigned long start_pfn =3D PFN_UP(start);
+	unsigned long end_pfn =3D PFN_DOWN(end);
+
+	if (start_pfn < end_pfn)
+		return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn));
+	return 0;
+}
+
+/*
+ * Sets up nid to range from @start to @end.  The return value is -errno if
+ * something went wrong, 0 otherwise.
+ */
+static int __init emu_setup_memblk(struct numa_meminfo *ei,
+				   struct numa_meminfo *pi,
+				   int nid, int phys_blk, u64 size)
+{
+	struct numa_memblk *eb =3D &ei->blk[ei->nr_blks];
+	struct numa_memblk *pb =3D &pi->blk[phys_blk];
+
+	if (ei->nr_blks >=3D NR_NODE_MEMBLKS) {
+		pr_err("NUMA: Too many emulated memblks, failing emulation\n");
+		return -EINVAL;
+	}
+
+	ei->nr_blks++;
+	eb->start =3D pb->start;
+	eb->end =3D pb->start + size;
+	eb->nid =3D nid;
+
+	if (emu_nid_to_phys[nid] =3D=3D NUMA_NO_NODE)
+		emu_nid_to_phys[nid] =3D pb->nid;
+
+	pb->start +=3D size;
+	if (pb->start >=3D pb->end) {
+		WARN_ON_ONCE(pb->start > pb->end);
+		numa_remove_memblk_from(phys_blk, pi);
+	}
+
+	printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n",
+	       nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20);
+	return 0;
+}
+
+/*
+ * Sets up nr_nodes fake nodes interleaved over physical nodes ranging fro=
m addr
+ * to max_addr.
+ *
+ * Returns zero on success or negative on error.
+ */
+static int __init split_nodes_interleave(struct numa_meminfo *ei,
+					 struct numa_meminfo *pi,
+					 u64 addr, u64 max_addr, int nr_nodes)
+{
+	nodemask_t physnode_mask =3D numa_nodes_parsed;
+	u64 size;
+	int big;
+	int nid =3D 0;
+	int i, ret;
+
+	if (nr_nodes <=3D 0)
+		return -1;
+	if (nr_nodes > MAX_NUMNODES) {
+		pr_info("numa=3Dfake=3D%d too large, reducing to %d\n",
+			nr_nodes, MAX_NUMNODES);
+		nr_nodes =3D MAX_NUMNODES;
+	}
+
+	/*
+	 * Calculate target node size.  x86_32 freaks on __udivdi3() so do
+	 * the division in ulong number of pages and convert back.
+	 */
+	size =3D max_addr - addr - mem_hole_size(addr, max_addr);
+	size =3D PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);
+
+	/*
+	 * Calculate the number of big nodes that can be allocated as a result
+	 * of consolidating the remainder.
+	 */
+	big =3D ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
+		FAKE_NODE_MIN_SIZE;
+
+	size &=3D FAKE_NODE_MIN_HASH_MASK;
+	if (!size) {
+		pr_err("Not enough memory for each node.  "
+			"NUMA emulation disabled.\n");
+		return -1;
+	}
+
+	/*
+	 * Continue to fill physical nodes with fake nodes until there is no
+	 * memory left on any of them.
+	 */
+	while (!nodes_empty(physnode_mask)) {
+		for_each_node_mask(i, physnode_mask) {
+#ifdef CONFIG_X86
+			u64 dma32_end =3D PFN_PHYS(MAX_DMA32_PFN);
+#endif
+			u64 start, limit, end;
+			int phys_blk;
+
+			phys_blk =3D emu_find_memblk_by_nid(i, pi);
+			if (phys_blk < 0) {
+				node_clear(i, physnode_mask);
+				continue;
+			}
+			start =3D pi->blk[phys_blk].start;
+			limit =3D pi->blk[phys_blk].end;
+			end =3D start + size;
+
+			if (nid < big)
+				end +=3D FAKE_NODE_MIN_SIZE;
+
+			/*
+			 * Continue to add memory to this fake node if its
+			 * non-reserved memory is less than the per-node size.
+			 */
+			while (end - start - mem_hole_size(start, end) < size) {
+				end +=3D FAKE_NODE_MIN_SIZE;
+				if (end > limit) {
+					end =3D limit;
+					break;
+				}
+			}
+
+#ifdef CONFIG_X86
+			/*
+			 * If there won't be at least FAKE_NODE_MIN_SIZE of
+			 * non-reserved memory in ZONE_DMA32 for the next node,
+			 * this one must extend to the boundary.
+			 */
+			if (end < dma32_end && dma32_end - end -
+			    mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+				end =3D dma32_end;
+#endif
+
+			/*
+			 * If there won't be enough non-reserved memory for the
+			 * next node, this one must extend to the end of the
+			 * physical node.
+			 */
+			if (limit - end - mem_hole_size(end, limit) < size)
+				end =3D limit;
+
+			ret =3D emu_setup_memblk(ei, pi, nid++ % nr_nodes,
+					       phys_blk,
+					       min(end, limit) - start);
+			if (ret < 0)
+				return ret;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Returns the end address of a node so that there is at least `size' amou=
nt of
+ * non-reserved memory or `max_addr' is reached.
+ */
+static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
+{
+	u64 end =3D start + size;
+
+	while (end - start - mem_hole_size(start, end) < size) {
+		end +=3D FAKE_NODE_MIN_SIZE;
+		if (end > max_addr) {
+			end =3D max_addr;
+			break;
+		}
+	}
+	return end;
+}
+
+static u64 uniform_size(u64 max_addr, u64 base, u64 hole, int nr_nodes)
+{
+	unsigned long max_pfn =3D PHYS_PFN(max_addr);
+	unsigned long base_pfn =3D PHYS_PFN(base);
+	unsigned long hole_pfns =3D PHYS_PFN(hole);
+
+	return PFN_PHYS((max_pfn - base_pfn - hole_pfns) / nr_nodes);
+}
+
+/*
+ * Sets up fake nodes of `size' interleaved over physical nodes ranging fr=
om
+ * `addr' to `max_addr'.
+ *
+ * Returns the updated nid counter on success or negative on error.
+ */
+static int __init split_nodes_size_interleave_uniform(struct numa_meminfo =
*ei,
+					      struct numa_meminfo *pi,
+					      u64 addr, u64 max_addr, u64 size,
+					      int nr_nodes, struct numa_memblk *pblk,
+					      int nid)
+{
+	nodemask_t physnode_mask =3D numa_nodes_parsed;
+	int i, ret, uniform =3D 0;
+	u64 min_size;
+
+	if ((!size && !nr_nodes) || (nr_nodes && !pblk))
+		return -1;
+
+	/*
+	 * In the 'uniform' case split the passed in physical node by
+	 * nr_nodes, in the non-uniform case, ignore the passed in
+	 * physical block and try to create nodes of at least size
+	 * @size.
+	 *
+	 * In the uniform case, split the nodes strictly by physical
+	 * capacity, i.e. ignore holes. In the non-uniform case account
+	 * for holes and treat @size as a minimum floor.
+	 */
+	if (!nr_nodes)
+		nr_nodes =3D MAX_NUMNODES;
+	else {
+		nodes_clear(physnode_mask);
+		node_set(pblk->nid, physnode_mask);
+		uniform =3D 1;
+	}
+
+	if (uniform) {
+		min_size =3D uniform_size(max_addr, addr, 0, nr_nodes);
+		size =3D min_size;
+	} else {
+		/*
+		 * The limit on emulated nodes is MAX_NUMNODES, so the
+		 * size per node is increased accordingly if the
+		 * requested size is too small.  This creates a uniform
+		 * distribution of node sizes across the entire machine
+		 * (but not necessarily over physical nodes).
+		 */
+		min_size =3D uniform_size(max_addr, addr,
+				mem_hole_size(addr, max_addr), nr_nodes);
+	}
+	min_size =3D ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE);
+	if (size < min_size) {
+		pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
+			size >> 20, min_size >> 20);
+		size =3D min_size;
+	}
+	size =3D ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE);
+
+	/*
+	 * Fill physical nodes with fake nodes of size until there is no memory
+	 * left on any of them.
+	 */
+	while (!nodes_empty(physnode_mask)) {
+		for_each_node_mask(i, physnode_mask) {
+#ifdef CONFIG_X86
+			u64 dma32_end =3D PFN_PHYS(MAX_DMA32_PFN);
+#endif
+			u64 start, limit, end;
+			int phys_blk;
+
+			phys_blk =3D emu_find_memblk_by_nid(i, pi);
+			if (phys_blk < 0) {
+				node_clear(i, physnode_mask);
+				continue;
+			}
+
+			start =3D pi->blk[phys_blk].start;
+			limit =3D pi->blk[phys_blk].end;
+
+			if (uniform)
+				end =3D start + size;
+			else
+				end =3D find_end_of_node(start, limit, size);
+
+#ifdef CONFIG_X86
+			/*
+			 * If there won't be at least FAKE_NODE_MIN_SIZE of
+			 * non-reserved memory in ZONE_DMA32 for the next node,
+			 * this one must extend to the boundary.
+			 */
+			if (end < dma32_end && dma32_end - end -
+			    mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+				end =3D dma32_end;
+#endif
+
+			/*
+			 * If there won't be enough non-reserved memory for the
+			 * next node, this one must extend to the end of the
+			 * physical node.
+			 */
+			if ((limit - end - mem_hole_size(end, limit) < size)
+					&& !uniform)
+				end =3D limit;
+
+			ret =3D emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
+					       phys_blk,
+					       min(end, limit) - start);
+			if (ret < 0)
+				return ret;
+		}
+	}
+	return nid;
+}
+
+/*
+ * Non-uniform split: calls split_nodes_size_interleave_uniform() with a
+ * zero node count, no physical block and a starting nid of 0, and
+ * returns its result unchanged.
+ */
+static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
+					      struct numa_meminfo *pi,
+					      u64 addr, u64 max_addr, u64 size)
+{
+	return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size,
+			0, NULL, 0);
+}
+
+/*
+ * Scan emu_nid_to_phys[] for emulated nodes that have a physical node.
+ *
+ * Stores in @dfl_phys_nid the physical nid of the lowest mapped emulated
+ * node, or NUMA_NO_NODE when nothing is mapped.  Returns the highest
+ * mapped emulated nid, or 0 when nothing is mapped.
+ */
+static int __init setup_emu2phys_nid(int *dfl_phys_nid)
+{
+	int highest =3D 0;
+	int emu;
+
+	*dfl_phys_nid =3D NUMA_NO_NODE;
+
+	for (emu =3D 0; emu < ARRAY_SIZE(emu_nid_to_phys); emu++) {
+		int phys =3D emu_nid_to_phys[emu];
+
+		/* unmapped emulated node: nothing to record */
+		if (phys =3D=3D NUMA_NO_NODE)
+			continue;
+
+		highest =3D emu;
+		if (*dfl_phys_nid =3D=3D NUMA_NO_NODE)
+			*dfl_phys_nid =3D phys;
+	}
+
+	return highest;
+}
+
+/**
+ * numa_emulation - Emulate NUMA nodes
+ * @numa_meminfo: NUMA configuration to massage
+ * @numa_dist_cnt: The size of the physical NUMA distance table
+ *
+ * Emulate NUMA nodes according to the numa=3Dfake kernel parameter.
+ * @numa_meminfo contains the physical memory configuration and is modified
+ * to reflect the emulated configuration on success.  @numa_dist_cnt is
+ * used to determine the size of the physical distance table.
+ *
+ * On success, the following modifications are made.
+ *
+ * - @numa_meminfo is updated to reflect the emulated nodes.
+ *
+ * - __apicid_to_node[] is updated such that APIC IDs are mapped to the
+ *   emulated nodes (only when built with CONFIG_X86).
+ *
+ * - NUMA distance table is rebuilt to represent distances between emulated
+ *   nodes.  The distances are determined considering how emulated nodes
+ *   are mapped to physical nodes and match the actual distances.
+ *
+ * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical
+ *   nodes.  This is used by numa_add_cpu() and numa_remove_cpu().
+ *
+ * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with
+ * identity mapping and no other modification is made.
+ */
+void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dis=
t_cnt)
+{
+	static struct numa_meminfo ei __initdata;
+	static struct numa_meminfo pi __initdata;
+	const u64 max_addr =3D PFN_PHYS(max_pfn);
+	u8 *phys_dist =3D NULL;
+	size_t phys_size =3D numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
+	int max_emu_nid, dfl_phys_nid;
+	int i, j, ret;
+
+	if (!emu_cmdline)
+		goto no_emu;
+
+	/* ei collects the emulated layout; pi is a copy of the physical one */
+	memset(&ei, 0, sizeof(ei));
+	pi =3D *numa_meminfo;
+
+	for (i =3D 0; i < MAX_NUMNODES; i++)
+		emu_nid_to_phys[i] =3D NUMA_NO_NODE;
+
+	/*
+	 * If the numa=3Dfake command-line contains a 'U', each physical node
+	 * is split uniformly into N fake nodes.  Otherwise, if it contains a
+	 * 'M' or 'G', it represents the fixed node size.  Otherwise, if it is
+	 * just a single number N, split the system RAM into N fake nodes.
+	 */
+	if (strchr(emu_cmdline, 'U')) {
+		nodemask_t physnode_mask =3D numa_nodes_parsed;
+		unsigned long n;
+		int nid =3D 0;
+
+		n =3D simple_strtoul(emu_cmdline, &emu_cmdline, 0);
+		ret =3D -1;
+		for_each_node_mask(i, physnode_mask) {
+			/*
+			 * The reason we pass in blk[0] is due to
+			 * numa_remove_memblk_from() called by
+			 * emu_setup_memblk() will delete entry 0
+			 * and then move everything else up in the pi.blk
+			 * array. Therefore we should always be looking
+			 * at blk[0].
+			 */
+			ret =3D split_nodes_size_interleave_uniform(&ei, &pi,
+					pi.blk[0].start, pi.blk[0].end, 0,
+					n, &pi.blk[0], nid);
+			if (ret < 0)
+				break;
+			if (ret < n) {
+				pr_info("%s: phys: %d only got %d of %ld nodes, failing\n",
+						__func__, i, ret, n);
+				ret =3D -1;
+				break;
+			}
+			/* continue numbering where this physical node stopped */
+			nid =3D ret;
+		}
+	} else if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
+		u64 size;
+
+		size =3D memparse(emu_cmdline, &emu_cmdline);
+		ret =3D split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
+	} else {
+		unsigned long n;
+
+		n =3D simple_strtoul(emu_cmdline, &emu_cmdline, 0);
+		ret =3D split_nodes_interleave(&ei, &pi, 0, max_addr, n);
+	}
+	/* skip the ':' separator; the rest is read by get_option() below */
+	if (*emu_cmdline =3D=3D ':')
+		emu_cmdline++;
+
+	if (ret < 0)
+		goto no_emu;
+
+	if (numa_cleanup_meminfo(&ei) < 0) {
+		pr_warn("NUMA: Warning: constructed meminfo invalid, disabling emulation=
\n");
+		goto no_emu;
+	}
+
+	/* copy the physical distance table */
+	if (numa_dist_cnt) {
+		u64 phys;
+
+		phys =3D memblock_phys_alloc_range(phys_size, PAGE_SIZE, 0,
+						 MEMBLOCK_ALLOC_ACCESSIBLE);
+		if (!phys) {
+			pr_warn("NUMA: Warning: can't allocate copy of distance table, disablin=
g emulation\n");
+			goto no_emu;
+		}
+		phys_dist =3D __va(phys);
+
+		for (i =3D 0; i < numa_dist_cnt; i++)
+			for (j =3D 0; j < numa_dist_cnt; j++)
+				phys_dist[i * numa_dist_cnt + j] =3D
+					node_distance(i, j);
+	}
+
+	/*
+	 * Determine the max emulated nid and the default phys nid to use
+	 * for unmapped nodes.
+	 */
+	max_emu_nid =3D setup_emu2phys_nid(&dfl_phys_nid);
+
+	/* commit */
+	*numa_meminfo =3D ei;
+
+	/* Make sure numa_nodes_parsed only contains emulated nodes */
+	nodes_clear(numa_nodes_parsed);
+	for (i =3D 0; i < ARRAY_SIZE(ei.blk); i++)
+		if (ei.blk[i].start !=3D ei.blk[i].end &&
+		    ei.blk[i].nid !=3D NUMA_NO_NODE)
+			node_set(ei.blk[i].nid, numa_nodes_parsed);
+
+#ifdef CONFIG_X86
+	/*
+	 * Transform __apicid_to_node table to use emulated nids by
+	 * reverse-mapping phys_nid.  The maps should always exist but fall
+	 * back to zero just in case.
+	 */
+	for (i =3D 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
+		if (__apicid_to_node[i] =3D=3D NUMA_NO_NODE)
+			continue;
+		for (j =3D 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
+			if (__apicid_to_node[i] =3D=3D emu_nid_to_phys[j])
+				break;
+		__apicid_to_node[i] =3D j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
+	}
+#endif
+
+	/* make sure all emulated nodes are mapped to a physical node */
+	for (i =3D 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
+		if (emu_nid_to_phys[i] =3D=3D NUMA_NO_NODE)
+			emu_nid_to_phys[i] =3D dfl_phys_nid;
+
+	/* transform distance table */
+	numa_free_distance();
+	for (i =3D 0; i < max_emu_nid + 1; i++) {
+		for (j =3D 0; j < max_emu_nid + 1; j++) {
+			int physi =3D emu_nid_to_phys[i];
+			int physj =3D emu_nid_to_phys[j];
+			int dist;
+
+			if (get_option(&emu_cmdline, &dist) =3D=3D 2)
+				; /* use dist as left by get_option() */
+			else if (physi >=3D numa_dist_cnt || physj >=3D numa_dist_cnt)
+				dist =3D physi =3D=3D physj ?
+					LOCAL_DISTANCE : REMOTE_DISTANCE;
+			else
+				dist =3D phys_dist[physi * numa_dist_cnt + physj];
+
+			numa_set_distance(i, j, dist);
+		}
+	}
+
+	/*
+	 * free the copied physical distance table
+	 *
+	 * NOTE(review): phys_dist is still NULL here when numa_dist_cnt is 0;
+	 * presumably memblock_free() tolerates that - confirm.
+	 */
+	memblock_free(phys_dist, phys_size);
+	return;
+
+no_emu:
+	/* No emulation.  Build identity emu_nid_to_phys[] for numa_add_cpu() */
+	for (i =3D 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
+		emu_nid_to_phys[i] =3D i;
+}
+
+#ifndef CONFIG_DEBUG_PER_CPU_MAPS
+/*
+ * NOTE(review): this declaration is only visible in the #ifndef branch,
+ * yet numa_set_cpumask() in the #else branch also calls
+ * early_cpu_to_node(); presumably a header declares it there - confirm.
+ */
+extern int early_cpu_to_node(unsigned int cpu);
+
+/*
+ * Add @cpu to the cpumask of every online node whose emu_nid_to_phys[]
+ * entry matches that of the cpu's own node.
+ */
+void numa_add_cpu(unsigned int cpu)
+{
+	int cpu_nid =3D early_cpu_to_node(cpu);
+	int phys, emu;
+
+	BUG_ON(cpu_nid =3D=3D NUMA_NO_NODE || !node_online(cpu_nid));
+
+	phys =3D emu_nid_to_phys[cpu_nid];
+
+	/*
+	 * Map the cpu to each emulated node that is allocated on the physical
+	 * node of the cpu's apic id.
+	 */
+	for_each_online_node(emu) {
+		if (emu_nid_to_phys[emu] !=3D phys)
+			continue;
+
+		cpumask_set_cpu(cpu, node_to_cpumask_map[emu]);
+	}
+}
+
+/* Clear @cpu from the cpumask of every online node. */
+void numa_remove_cpu(unsigned int cpu)
+{
+	int nid;
+
+	for_each_online_node(nid) {
+		cpumask_clear_cpu(cpu, node_to_cpumask_map[nid]);
+	}
+}
+#else	/* !CONFIG_DEBUG_PER_CPU_MAPS */
+/*
+ * Apply @enable for @cpu, through debug_cpumask_set_cpu(), on every
+ * online node whose emu_nid_to_phys[] entry matches that of the cpu's
+ * own node.  Does nothing when the cpu has no node.
+ */
+static void numa_set_cpumask(int cpu, bool enable)
+{
+	int cpu_nid =3D early_cpu_to_node(cpu);
+	int phys, emu;
+
+	/* early_cpu_to_node() already emits a warning and trace */
+	if (cpu_nid =3D=3D NUMA_NO_NODE)
+		return;
+
+	phys =3D emu_nid_to_phys[cpu_nid];
+
+	for_each_online_node(emu)
+		if (emu_nid_to_phys[emu] =3D=3D phys)
+			debug_cpumask_set_cpu(cpu, emu, enable);
+}
+
+/*
+ * Variant built when CONFIG_DEBUG_PER_CPU_MAPS is defined: forwards to
+ * numa_set_cpumask() with enable set to true.
+ */
+void numa_add_cpu(unsigned int cpu)
+{
+	numa_set_cpumask(cpu, true);
+}
+
+/*
+ * Variant built when CONFIG_DEBUG_PER_CPU_MAPS is defined: forwards to
+ * numa_set_cpumask() with enable set to false.
+ */
+void numa_remove_cpu(unsigned int cpu)
+{
+	numa_set_cpumask(cpu, false);
+}
+#endif	/* !CONFIG_DEBUG_PER_CPU_MAPS */
 #endif
--=20
2.32.0.3.gf3a3e56d6