From nobody Fri Jan 2 11:57:02 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 74D15CDB465 for ; Thu, 12 Oct 2023 02:49:26 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1376914AbjJLCtZ (ORCPT ); Wed, 11 Oct 2023 22:49:25 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:56930 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S235280AbjJLCsy (ORCPT ); Wed, 11 Oct 2023 22:48:54 -0400 Received: from out30-97.freemail.mail.aliyun.com (out30-97.freemail.mail.aliyun.com [115.124.30.97]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 94750A4 for ; Wed, 11 Oct 2023 19:48:51 -0700 (PDT) X-Alimail-AntiSpam: AC=PASS;BC=-1|-1;BR=01201311R111e4;CH=green;DM=||false|;DS=||;FP=0|-1|-1|-1|0|-1|-1|-1;HT=ay29a033018045168;MF=rongwei.wang@linux.alibaba.com;NM=1;PH=DS;RN=9;SR=0;TI=SMTPD_---0VtykMfs_1697078928; Received: from localhost.localdomain(mailfrom:rongwei.wang@linux.alibaba.com fp:SMTPD_---0VtykMfs_1697078928) by smtp.aliyun-inc.com; Thu, 12 Oct 2023 10:48:48 +0800 From: Rongwei Wang To: linux-arm-kernel@lists.infradead.org, linux-kernel@vger.kernel.org, linux-mm@kvack.org Cc: akpm@linux-foundation.org, willy@infradead.org, catalin.marinas@arm.com, dave.hansen@linux.intel.com, tj@kernel.org, mingo@redhat.com Subject: [PATCH RFC 1/5] mm/numa: move numa emulation APIs into generic files Date: Thu, 12 Oct 2023 10:48:38 +0800 Message-Id: <20231012024842.99703-2-rongwei.wang@linux.alibaba.com> X-Mailer: git-send-email 2.40.0 In-Reply-To: <20231012024842.99703-1-rongwei.wang@linux.alibaba.com> References: <20231012024842.99703-1-rongwei.wang@linux.alibaba.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org 
Content-Type: text/plain; charset="utf-8" In order to support NUMA EMU for other arch, some functions that used by numa_meminfo should be moved out x86 arch. mm/numa.c created to place above API. CONFIG_NUMA_EMU will be handled later. Signed-off-by: Rongwei Wang --- arch/x86/include/asm/numa.h | 3 - arch/x86/mm/numa.c | 216 +------------------------- arch/x86/mm/numa_internal.h | 14 +- include/asm-generic/numa.h | 18 +++ mm/Makefile | 1 + mm/numa.c | 298 ++++++++++++++++++++++++++++++++++++ 6 files changed, 323 insertions(+), 227 deletions(-) create mode 100644 mm/numa.c diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index e3bae2b60a0d..8d79be8095d5 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -9,9 +9,6 @@ #include =20 #ifdef CONFIG_NUMA - -#define NR_NODE_MEMBLKS (MAX_NUMNODES*2) - /* * Too small node sizes may confuse the VM badly. Usually they * result from BIOS bugs. So dont recognize nodes as standalone diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 2aadb2019b4f..969b11fff03f 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -25,8 +25,8 @@ nodemask_t numa_nodes_parsed __initdata; struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); =20 -static struct numa_meminfo numa_meminfo __initdata_or_meminfo; -static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo; +extern struct numa_meminfo numa_meminfo; +extern struct numa_meminfo numa_reserved_meminfo; =20 static int numa_distance_cnt; static u8 *numa_distance; @@ -148,34 +148,6 @@ static int __init numa_add_memblk_to(int nid, u64 star= t, u64 end, return 0; } =20 -/** - * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo - * @idx: Index of memblk to remove - * @mi: numa_meminfo to remove memblk from - * - * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and - * decrementing @mi->nr_blks. 
- */ -void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) -{ - mi->nr_blks--; - memmove(&mi->blk[idx], &mi->blk[idx + 1], - (mi->nr_blks - idx) * sizeof(mi->blk[0])); -} - -/** - * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to ano= ther - * @dst: numa_meminfo to append block to - * @idx: Index of memblk to remove - * @src: numa_meminfo to remove memblk from - */ -static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx, - struct numa_meminfo *src) -{ - dst->blk[dst->nr_blks++] =3D src->blk[idx]; - numa_remove_memblk_from(idx, src); -} - /** * numa_add_memblk - Add one numa_memblk to numa_meminfo * @nid: NUMA node ID of the new memblk @@ -225,124 +197,6 @@ static void __init alloc_node_data(int nid) node_set_online(nid); } =20 -/** - * numa_cleanup_meminfo - Cleanup a numa_meminfo - * @mi: numa_meminfo to clean up - * - * Sanitize @mi by merging and removing unnecessary memblks. Also check f= or - * conflicts and clear unused memblks. - * - * RETURNS: - * 0 on success, -errno on failure. 
- */ -int __init numa_cleanup_meminfo(struct numa_meminfo *mi) -{ - const u64 low =3D 0; - const u64 high =3D PFN_PHYS(max_pfn); - int i, j, k; - - /* first, trim all entries */ - for (i =3D 0; i < mi->nr_blks; i++) { - struct numa_memblk *bi =3D &mi->blk[i]; - - /* move / save reserved memory ranges */ - if (!memblock_overlaps_region(&memblock.memory, - bi->start, bi->end - bi->start)) { - numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi); - continue; - } - - /* make sure all non-reserved blocks are inside the limits */ - bi->start =3D max(bi->start, low); - - /* preserve info for non-RAM areas above 'max_pfn': */ - if (bi->end > high) { - numa_add_memblk_to(bi->nid, high, bi->end, - &numa_reserved_meminfo); - bi->end =3D high; - } - - /* and there's no empty block */ - if (bi->start >=3D bi->end) - numa_remove_memblk_from(i--, mi); - } - - /* merge neighboring / overlapping entries */ - for (i =3D 0; i < mi->nr_blks; i++) { - struct numa_memblk *bi =3D &mi->blk[i]; - - for (j =3D i + 1; j < mi->nr_blks; j++) { - struct numa_memblk *bj =3D &mi->blk[j]; - u64 start, end; - - /* - * See whether there are overlapping blocks. Whine - * about but allow overlaps of the same nid. They - * will be merged below. - */ - if (bi->end > bj->start && bi->start < bj->end) { - if (bi->nid !=3D bj->nid) { - pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#01= 0Lx-%#010Lx]\n", - bi->nid, bi->start, bi->end - 1, - bj->nid, bj->start, bj->end - 1); - return -EINVAL; - } - pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [= mem %#010Lx-%#010Lx]\n", - bi->nid, bi->start, bi->end - 1, - bj->start, bj->end - 1); - } - - /* - * Join together blocks on the same node, holes - * between which don't overlap with memory on other - * nodes. 
- */ - if (bi->nid !=3D bj->nid) - continue; - start =3D min(bi->start, bj->start); - end =3D max(bi->end, bj->end); - for (k =3D 0; k < mi->nr_blks; k++) { - struct numa_memblk *bk =3D &mi->blk[k]; - - if (bi->nid =3D=3D bk->nid) - continue; - if (start < bk->end && end > bk->start) - break; - } - if (k < mi->nr_blks) - continue; - printk(KERN_INFO "NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#= 010Lx] -> [mem %#010Lx-%#010Lx]\n", - bi->nid, bi->start, bi->end - 1, bj->start, - bj->end - 1, start, end - 1); - bi->start =3D start; - bi->end =3D end; - numa_remove_memblk_from(j--, mi); - } - } - - /* clear unused ones */ - for (i =3D mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) { - mi->blk[i].start =3D mi->blk[i].end =3D 0; - mi->blk[i].nid =3D NUMA_NO_NODE; - } - - return 0; -} - -/* - * Set nodes, which have memory in @mi, in *@nodemask. - */ -static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, - const struct numa_meminfo *mi) -{ - int i; - - for (i =3D 0; i < ARRAY_SIZE(mi->blk); i++) - if (mi->blk[i].start !=3D mi->blk[i].end && - mi->blk[i].nid !=3D NUMA_NO_NODE) - node_set(mi->blk[i].nid, *nodemask); -} - /** * numa_reset_distance - Reset NUMA distance table * @@ -478,72 +332,6 @@ static bool __init numa_meminfo_cover_memory(const str= uct numa_meminfo *mi) return true; } =20 -/* - * Mark all currently memblock-reserved physical memory (which covers the - * kernel's own memory ranges) as hot-unswappable. - */ -static void __init numa_clear_kernel_node_hotplug(void) -{ - nodemask_t reserved_nodemask =3D NODE_MASK_NONE; - struct memblock_region *mb_region; - int i; - - /* - * We have to do some preprocessing of memblock regions, to - * make them suitable for reservation. - * - * At this time, all memory regions reserved by memblock are - * used by the kernel, but those regions are not split up - * along node boundaries yet, and don't necessarily have their - * node ID set yet either. 
- * - * So iterate over all memory known to the x86 architecture, - * and use those ranges to set the nid in memblock.reserved. - * This will split up the memblock regions along node - * boundaries and will set the node IDs as well. - */ - for (i =3D 0; i < numa_meminfo.nr_blks; i++) { - struct numa_memblk *mb =3D numa_meminfo.blk + i; - int ret; - - ret =3D memblock_set_node(mb->start, mb->end - mb->start, &memblock.rese= rved, mb->nid); - WARN_ON_ONCE(ret); - } - - /* - * Now go over all reserved memblock regions, to construct a - * node mask of all kernel reserved memory areas. - * - * [ Note, when booting with mem=3Dnn[kMG] or in a kdump kernel, - * numa_meminfo might not include all memblock.reserved - * memory ranges, because quirks such as trim_snb_memory() - * reserve specific pages for Sandy Bridge graphics. ] - */ - for_each_reserved_mem_region(mb_region) { - int nid =3D memblock_get_region_node(mb_region); - - if (nid !=3D MAX_NUMNODES) - node_set(nid, reserved_nodemask); - } - - /* - * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory - * belonging to the reserved node mask. 
- * - * Note that this will include memory regions that reside - * on nodes that contain kernel memory - entire nodes - * become hot-unpluggable: - */ - for (i =3D 0; i < numa_meminfo.nr_blks; i++) { - struct numa_memblk *mb =3D numa_meminfo.blk + i; - - if (!node_isset(mb->nid, reserved_nodemask)) - continue; - - memblock_clear_hotplug(mb->start, mb->end - mb->start); - } -} - static int __init numa_register_memblks(struct numa_meminfo *mi) { int i, nid; diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h index 86860f279662..b6053beb81b1 100644 --- a/arch/x86/mm/numa_internal.h +++ b/arch/x86/mm/numa_internal.h @@ -16,19 +16,13 @@ struct numa_meminfo { struct numa_memblk blk[NR_NODE_MEMBLKS]; }; =20 -void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi); -int __init numa_cleanup_meminfo(struct numa_meminfo *mi); +extern int __init numa_cleanup_meminfo(struct numa_meminfo *mi); void __init numa_reset_distance(void); =20 void __init x86_numa_init(void); =20 -#ifdef CONFIG_NUMA_EMU -void __init numa_emulation(struct numa_meminfo *numa_meminfo, - int numa_dist_cnt); -#else -static inline void numa_emulation(struct numa_meminfo *numa_meminfo, - int numa_dist_cnt) -{ } -#endif +extern void __init numa_emulation(struct numa_meminfo *numa_meminfo, + int numa_dist_cnt); + =20 #endif /* __X86_MM_NUMA_INTERNAL_H */ diff --git a/include/asm-generic/numa.h b/include/asm-generic/numa.h index 1a3ad6d29833..929d7c582a73 100644 --- a/include/asm-generic/numa.h +++ b/include/asm-generic/numa.h @@ -39,6 +39,24 @@ void numa_store_cpu_info(unsigned int cpu); void numa_add_cpu(unsigned int cpu); void numa_remove_cpu(unsigned int cpu); =20 +struct numa_memblk { + u64 start; + u64 end; + int nid; +}; + +struct numa_meminfo { + int nr_blks; + struct numa_memblk blk[NR_NODE_MEMBLKS]; +}; + +extern struct numa_meminfo numa_meminfo; + +int __init numa_register_memblks(struct numa_meminfo *mi); +int __init numa_cleanup_meminfo(struct numa_meminfo *mi); +void 
__init numa_emulation(struct numa_meminfo *numa_meminfo, + int numa_dist_cnt); + #else /* CONFIG_NUMA */ =20 static inline void numa_store_cpu_info(unsigned int cpu) { } diff --git a/mm/Makefile b/mm/Makefile index ec65984e2ade..6fc1bd7c9f5b 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -138,3 +138,4 @@ obj-$(CONFIG_IO_MAPPING) +=3D io-mapping.o obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) +=3D bootmem_info.o obj-$(CONFIG_GENERIC_IOREMAP) +=3D ioremap.o obj-$(CONFIG_SHRINKER_DEBUG) +=3D shrinker_debug.o +obj-$(CONFIG_NUMA) +=3D numa.o diff --git a/mm/numa.c b/mm/numa.c new file mode 100644 index 000000000000..88277e8404f0 --- /dev/null +++ b/mm/numa.c @@ -0,0 +1,298 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +struct numa_meminfo numa_meminfo __initdata_or_meminfo; +struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo; + +/* + * Set nodes, which have memory in @mi, in *@nodemask. + */ +void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, + const struct numa_meminfo *mi) +{ + int i; + + for (i =3D 0; i < ARRAY_SIZE(mi->blk); i++) + if (mi->blk[i].start !=3D mi->blk[i].end && + mi->blk[i].nid !=3D NUMA_NO_NODE) + node_set(mi->blk[i].nid, *nodemask); +} + +/** + * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo + * @idx: Index of memblk to remove + * @mi: numa_meminfo to remove memblk from + * + * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and + * decrementing @mi->nr_blks. 
+ */ +static void __init numa_remove_memblk_from(int idx, struct numa_meminfo *m= i) +{ + mi->nr_blks--; + memmove(&mi->blk[idx], &mi->blk[idx + 1], + (mi->nr_blks - idx) * sizeof(mi->blk[0])); +} + +/** + * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to ano= ther + * @dst: numa_meminfo to append block to + * @idx: Index of memblk to remove + * @src: numa_meminfo to remove memblk from + */ +static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx, + struct numa_meminfo *src) +{ + dst->blk[dst->nr_blks++] =3D src->blk[idx]; + numa_remove_memblk_from(idx, src); +} + +int __init numa_add_memblk_to(int nid, u64 start, u64 end, + struct numa_meminfo *mi) +{ + /* ignore zero length blks */ + if (start =3D=3D end) + return 0; + + /* whine about and ignore invalid blks */ + if (start > end || nid < 0 || nid >=3D MAX_NUMNODES) { + pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n", + nid, start, end - 1); + return 0; + } + + if (mi->nr_blks >=3D NR_NODE_MEMBLKS) { + pr_err("too many memblk ranges\n"); + return -EINVAL; + } + + mi->blk[mi->nr_blks].start =3D start; + mi->blk[mi->nr_blks].end =3D end; + mi->blk[mi->nr_blks].nid =3D nid; + mi->nr_blks++; + return 0; +} + +/** + * numa_cleanup_meminfo - Cleanup a numa_meminfo + * @mi: numa_meminfo to clean up + * + * Sanitize @mi by merging and removing unnecessary memblks. Also check f= or + * conflicts and clear unused memblks. + * + * RETURNS: + * 0 on success, -errno on failure. 
+ */ +int __init numa_cleanup_meminfo(struct numa_meminfo *mi) +{ + const u64 low =3D 0; + const u64 high =3D PFN_PHYS(max_pfn); + int i, j, k; + + /* first, trim all entries */ + for (i =3D 0; i < mi->nr_blks; i++) { + struct numa_memblk *bi =3D &mi->blk[i]; + + /* move / save reserved memory ranges */ + if (!memblock_overlaps_region(&memblock.memory, + bi->start, bi->end - bi->start)) { + numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi); + continue; + } + + /* make sure all non-reserved blocks are inside the limits */ + bi->start =3D max(bi->start, low); + + /* preserve info for non-RAM areas above 'max_pfn': */ + if (bi->end > high) { + numa_add_memblk_to(bi->nid, high, bi->end, + &numa_reserved_meminfo); + bi->end =3D high; + } + + /* and there's no empty block */ + if (bi->start >=3D bi->end) + numa_remove_memblk_from(i--, mi); + } + + /* merge neighboring / overlapping entries */ + for (i =3D 0; i < mi->nr_blks; i++) { + struct numa_memblk *bi =3D &mi->blk[i]; + + for (j =3D i + 1; j < mi->nr_blks; j++) { + struct numa_memblk *bj =3D &mi->blk[j]; + u64 start, end; + + /* + * See whether there are overlapping blocks. Whine + * about but allow overlaps of the same nid. They + * will be merged below. + */ + if (bi->end > bj->start && bi->start < bj->end) { + if (bi->nid !=3D bj->nid) { + pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#01= 0Lx-%#010Lx]\n", + bi->nid, bi->start, bi->end - 1, + bj->nid, bj->start, bj->end - 1); + return -EINVAL; + } + pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [= mem %#010Lx-%#010Lx]\n", + bi->nid, bi->start, bi->end - 1, + bj->start, bj->end - 1); + } + + /* + * Join together blocks on the same node, holes + * between which don't overlap with memory on other + * nodes. 
+ */ + if (bi->nid !=3D bj->nid) + continue; + start =3D min(bi->start, bj->start); + end =3D max(bi->end, bj->end); + for (k =3D 0; k < mi->nr_blks; k++) { + struct numa_memblk *bk =3D &mi->blk[k]; + + if (bi->nid =3D=3D bk->nid) + continue; + if (start < bk->end && end > bk->start) + break; + } + if (k < mi->nr_blks) + continue; + printk(KERN_INFO "NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#= 010Lx] -> [mem %#010Lx-%#010Lx]\n", + bi->nid, bi->start, bi->end - 1, bj->start, + bj->end - 1, start, end - 1); + bi->start =3D start; + bi->end =3D end; + numa_remove_memblk_from(j--, mi); + } + } + + /* clear unused ones */ + for (i =3D mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) { + mi->blk[i].start =3D mi->blk[i].end =3D 0; + mi->blk[i].nid =3D NUMA_NO_NODE; + } + + return 0; +} + +/* + * Mark all currently memblock-reserved physical memory (which covers the + * kernel's own memory ranges) as hot-unswappable. + */ +static void __init numa_clear_kernel_node_hotplug(void) +{ + nodemask_t reserved_nodemask =3D NODE_MASK_NONE; + struct memblock_region *mb_region; + int i; + + /* + * We have to do some preprocessing of memblock regions, to + * make them suitable for reservation. + * + * At this time, all memory regions reserved by memblock are + * used by the kernel, but those regions are not split up + * along node boundaries yet, and don't necessarily have their + * node ID set yet either. + * + * So iterate over all memory known to the x86 architecture, + * and use those ranges to set the nid in memblock.reserved. + * This will split up the memblock regions along node + * boundaries and will set the node IDs as well. 
+ */ + for (i =3D 0; i < numa_meminfo.nr_blks; i++) { + struct numa_memblk *mb =3D numa_meminfo.blk + i; + int ret; + + ret =3D memblock_set_node(mb->start, mb->end - mb->start, &memblock.rese= rved, mb->nid); + WARN_ON_ONCE(ret); + } + + /* + * Now go over all reserved memblock regions, to construct a + * node mask of all kernel reserved memory areas. + * + * [ Note, when booting with mem=3Dnn[kMG] or in a kdump kernel, + * numa_meminfo might not include all memblock.reserved + * memory ranges, because quirks such as trim_snb_memory() + * reserve specific pages for Sandy Bridge graphics. ] + */ + for_each_reserved_mem_region(mb_region) { + int nid =3D memblock_get_region_node(mb_region); + + if (nid !=3D MAX_NUMNODES) + node_set(nid, reserved_nodemask); + } + + /* + * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory + * belonging to the reserved node mask. + * + * Note that this will include memory regions that reside + * on nodes that contain kernel memory - entire nodes + * become hot-unpluggable: + */ + for (i =3D 0; i < numa_meminfo.nr_blks; i++) { + struct numa_memblk *mb =3D numa_meminfo.blk + i; + + if (!node_isset(mb->nid, reserved_nodemask)) + continue; + + memblock_clear_hotplug(mb->start, mb->end - mb->start); + } +} + +int __weak __init numa_register_memblks(struct numa_meminfo *mi) +{ + int i; + + /* Account for nodes with cpus and no memory */ + node_possible_map =3D numa_nodes_parsed; + numa_nodemask_from_meminfo(&node_possible_map, mi); + if (WARN_ON(nodes_empty(node_possible_map))) + return -EINVAL; + + for (i =3D 0; i < mi->nr_blks; i++) { + struct numa_memblk *mb =3D &mi->blk[i]; + memblock_set_node(mb->start, mb->end - mb->start, + &memblock.memory, mb->nid); + } + + /* + * At very early time, the kernel have to use some memory such as + * loading the kernel image. We cannot prevent this anyway. So any + * node the kernel resides in should be un-hotpluggable. + * + * And when we come here, alloc node data won't fail. 
+ */ + numa_clear_kernel_node_hotplug(); + + /* + * If sections array is gonna be used for pfn -> nid mapping, check + * whether its granularity is fine enough. + */ + if (IS_ENABLED(NODE_NOT_IN_PAGE_FLAGS)) { + unsigned long pfn_align =3D node_map_pfn_alignment(); + + if (pfn_align && pfn_align < PAGES_PER_SECTION) { + pr_warn("Node alignment %LuMB < min %LuMB, rejecting NUMA config\n", + PFN_PHYS(pfn_align) >> 20, + PFN_PHYS(PAGES_PER_SECTION) >> 20); + return -EINVAL; + } + } + + return 0; +} --=20 2.32.0.3.gf3a3e56d6 From nobody Fri Jan 2 11:57:02 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 16D4BCDB465 for ; Thu, 12 Oct 2023 02:49:24 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S235315AbjJLCtX (ORCPT ); Wed, 11 Oct 2023 22:49:23 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:56916 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S235276AbjJLCsy (ORCPT ); Wed, 11 Oct 2023 22:48:54 -0400 Received: from out30-124.freemail.mail.aliyun.com (out30-124.freemail.mail.aliyun.com [115.124.30.124]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 96508A9 for ; Wed, 11 Oct 2023 19:48:52 -0700 (PDT) X-Alimail-AntiSpam: AC=PASS;BC=-1|-1;BR=01201311R131e4;CH=green;DM=||false|;DS=||;FP=0|-1|-1|-1|0|-1|-1|-1;HT=ay29a033018045192;MF=rongwei.wang@linux.alibaba.com;NM=1;PH=DS;RN=9;SR=0;TI=SMTPD_---0VtykMgL_1697078929; Received: from localhost.localdomain(mailfrom:rongwei.wang@linux.alibaba.com fp:SMTPD_---0VtykMgL_1697078929) by smtp.aliyun-inc.com; Thu, 12 Oct 2023 10:48:49 +0800 From: Rongwei Wang To: linux-arm-kernel@lists.infradead.org, linux-kernel@vger.kernel.org, linux-mm@kvack.org Cc: akpm@linux-foundation.org, willy@infradead.org, catalin.marinas@arm.com, 
dave.hansen@linux.intel.com, tj@kernel.org, mingo@redhat.com Subject: [PATCH RFC 2/5] mm: percpu: fix variable type of cpu Date: Thu, 12 Oct 2023 10:48:39 +0800 Message-Id: <20231012024842.99703-3-rongwei.wang@linux.alibaba.com> X-Mailer: git-send-email 2.40.0 In-Reply-To: <20231012024842.99703-1-rongwei.wang@linux.alibaba.com> References: <20231012024842.99703-1-rongwei.wang@linux.alibaba.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" Almost all places declare 'cpu' as 'unsigned int' type, but early_cpu_to_node() does not. Correct it in this patch. Signed-off-by: Rongwei Wang --- drivers/base/arch_numa.c | 2 +- include/linux/percpu.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c index eaa31e567d1e..db0bb8b8fd67 100644 --- a/drivers/base/arch_numa.c +++ b/drivers/base/arch_numa.c @@ -144,7 +144,7 @@ void __init early_map_cpu_to_node(unsigned int cpu, int= nid) unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); =20 -static int __init early_cpu_to_node(int cpu) +static int __init early_cpu_to_node(unsigned int cpu) { return cpu_to_node_map[cpu]; } diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 68fac2e7cbe6..4aee8400af54 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -100,7 +100,7 @@ extern const char * const pcpu_fc_names[PCPU_FC_NR]; =20 extern enum pcpu_fc pcpu_chosen_fc; =20 -typedef int (pcpu_fc_cpu_to_node_fn_t)(int cpu); +typedef int (pcpu_fc_cpu_to_node_fn_t)(unsigned int cpu); typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to= ); =20 extern struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, --=20 2.32.0.3.gf3a3e56d6 From nobody Fri Jan 2 11:57:02 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on 
aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id B31FFCDB465 for ; Thu, 12 Oct 2023 02:49:29 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1376938AbjJLCt2 (ORCPT ); Wed, 11 Oct 2023 22:49:28 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:56932 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1376775AbjJLCsz (ORCPT ); Wed, 11 Oct 2023 22:48:55 -0400 Received: from out30-113.freemail.mail.aliyun.com (out30-113.freemail.mail.aliyun.com [115.124.30.113]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 42F78B6 for ; Wed, 11 Oct 2023 19:48:53 -0700 (PDT) X-Alimail-AntiSpam: AC=PASS;BC=-1|-1;BR=01201311R561e4;CH=green;DM=||false|;DS=||;FP=0|-1|-1|-1|0|-1|-1|-1;HT=ay29a033018046049;MF=rongwei.wang@linux.alibaba.com;NM=1;PH=DS;RN=9;SR=0;TI=SMTPD_---0VtykMgi_1697078930; Received: from localhost.localdomain(mailfrom:rongwei.wang@linux.alibaba.com fp:SMTPD_---0VtykMgi_1697078930) by smtp.aliyun-inc.com; Thu, 12 Oct 2023 10:48:51 +0800 From: Rongwei Wang To: linux-arm-kernel@lists.infradead.org, linux-kernel@vger.kernel.org, linux-mm@kvack.org Cc: akpm@linux-foundation.org, willy@infradead.org, catalin.marinas@arm.com, dave.hansen@linux.intel.com, tj@kernel.org, mingo@redhat.com Subject: [PATCH RFC 3/5] arch_numa: remove __init in early_cpu_to_node() Date: Thu, 12 Oct 2023 10:48:40 +0800 Message-Id: <20231012024842.99703-4-rongwei.wang@linux.alibaba.com> X-Mailer: git-send-email 2.40.0 In-Reply-To: <20231012024842.99703-1-rongwei.wang@linux.alibaba.com> References: <20231012024842.99703-1-rongwei.wang@linux.alibaba.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" Most of arch does not stick '__init' for early_cpu_to_node(). 
And it's safe to delete this attribute here, ready for later numa emulation. Signed-off-by: Rongwei Wang --- drivers/base/arch_numa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c index db0bb8b8fd67..5df0ad5cb09d 100644 --- a/drivers/base/arch_numa.c +++ b/drivers/base/arch_numa.c @@ -144,7 +144,7 @@ void __init early_map_cpu_to_node(unsigned int cpu, int= nid) unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); =20 -static int __init early_cpu_to_node(unsigned int cpu) +int early_cpu_to_node(unsigned int cpu) { return cpu_to_node_map[cpu]; } --=20 2.32.0.3.gf3a3e56d6 From nobody Fri Jan 2 11:57:02 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id E00DECDB465 for ; Thu, 12 Oct 2023 02:49:32 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1376964AbjJLCtb (ORCPT ); Wed, 11 Oct 2023 22:49:31 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:41070 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1376803AbjJLCs5 (ORCPT ); Wed, 11 Oct 2023 22:48:57 -0400 Received: from out30-113.freemail.mail.aliyun.com (out30-113.freemail.mail.aliyun.com [115.124.30.113]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 48BFCA9 for ; Wed, 11 Oct 2023 19:48:54 -0700 (PDT) X-Alimail-AntiSpam: AC=PASS;BC=-1|-1;BR=01201311R111e4;CH=green;DM=||false|;DS=||;FP=0|-1|-1|-1|0|-1|-1|-1;HT=ay29a033018046060;MF=rongwei.wang@linux.alibaba.com;NM=1;PH=DS;RN=9;SR=0;TI=SMTPD_---0VtykMh0_1697078931; Received: from localhost.localdomain(mailfrom:rongwei.wang@linux.alibaba.com fp:SMTPD_---0VtykMh0_1697078931) by smtp.aliyun-inc.com; Thu, 12 Oct 2023 10:48:52 +0800 From: Rongwei Wang To: 
linux-arm-kernel@lists.infradead.org, linux-kernel@vger.kernel.org, linux-mm@kvack.org Cc: akpm@linux-foundation.org, willy@infradead.org, catalin.marinas@arm.com, dave.hansen@linux.intel.com, tj@kernel.org, mingo@redhat.com Subject: [PATCH RFC 4/5] mm/numa: support CONFIG_NUMA_EMU for arm64 Date: Thu, 12 Oct 2023 10:48:41 +0800 Message-Id: <20231012024842.99703-5-rongwei.wang@linux.alibaba.com> X-Mailer: git-send-email 2.40.0 In-Reply-To: <20231012024842.99703-1-rongwei.wang@linux.alibaba.com> References: <20231012024842.99703-1-rongwei.wang@linux.alibaba.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" The CONFIG_NUMA_EMU migrates from x86/Kconfig to mm/Kconfig. Now x86 and arm64 support it. Signed-off-by: Rongwei Wang --- arch/x86/Kconfig | 8 - arch/x86/mm/Makefile | 1 - arch/x86/mm/numa_emulation.c | 585 ----------------------------------- drivers/base/arch_numa.c | 3 + include/asm-generic/numa.h | 12 + mm/Kconfig | 8 + mm/numa.c | 12 + 7 files changed, 35 insertions(+), 594 deletions(-) delete mode 100644 arch/x86/mm/numa_emulation.c diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 66bfabae8814..13438bfe2ec1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1568,14 +1568,6 @@ config X86_64_ACPI_NUMA help Enable ACPI SRAT based node topology detection. =20 -config NUMA_EMU - bool "NUMA emulation" - depends on NUMA - help - Enable NUMA emulation. A flat machine will be split - into virtual nodes when booted with "numa=3Dfake=3DN", where N is the - number of nodes. This is only useful for debugging. 
- config NODES_SHIFT int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP range 1 10 diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index c80febc44cd2..1581f17e5de4 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -56,7 +56,6 @@ obj-$(CONFIG_MMIOTRACE_TEST) +=3D testmmiotrace.o obj-$(CONFIG_NUMA) +=3D numa.o numa_$(BITS).o obj-$(CONFIG_AMD_NUMA) +=3D amdtopology.o obj-$(CONFIG_ACPI_NUMA) +=3D srat.o -obj-$(CONFIG_NUMA_EMU) +=3D numa_emulation.o =20 obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) +=3D pkeys.o obj-$(CONFIG_RANDOMIZE_MEMORY) +=3D kaslr.o diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c deleted file mode 100644 index 9a9305367fdd..000000000000 --- a/arch/x86/mm/numa_emulation.c +++ /dev/null @@ -1,585 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * NUMA emulation - */ -#include -#include -#include -#include -#include - -#include "numa_internal.h" - -static int emu_nid_to_phys[MAX_NUMNODES]; -static char *emu_cmdline __initdata; - -int __init numa_emu_cmdline(char *str) -{ - emu_cmdline =3D str; - return 0; -} - -static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminf= o *mi) -{ - int i; - - for (i =3D 0; i < mi->nr_blks; i++) - if (mi->blk[i].nid =3D=3D nid) - return i; - return -ENOENT; -} - -static u64 __init mem_hole_size(u64 start, u64 end) -{ - unsigned long start_pfn =3D PFN_UP(start); - unsigned long end_pfn =3D PFN_DOWN(end); - - if (start_pfn < end_pfn) - return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn)); - return 0; -} - -/* - * Sets up nid to range from @start to @end. The return value is -errno if - * something went wrong, 0 otherwise. 
- */ -static int __init emu_setup_memblk(struct numa_meminfo *ei, - struct numa_meminfo *pi, - int nid, int phys_blk, u64 size) -{ - struct numa_memblk *eb =3D &ei->blk[ei->nr_blks]; - struct numa_memblk *pb =3D &pi->blk[phys_blk]; - - if (ei->nr_blks >=3D NR_NODE_MEMBLKS) { - pr_err("NUMA: Too many emulated memblks, failing emulation\n"); - return -EINVAL; - } - - ei->nr_blks++; - eb->start =3D pb->start; - eb->end =3D pb->start + size; - eb->nid =3D nid; - - if (emu_nid_to_phys[nid] =3D=3D NUMA_NO_NODE) - emu_nid_to_phys[nid] =3D pb->nid; - - pb->start +=3D size; - if (pb->start >=3D pb->end) { - WARN_ON_ONCE(pb->start > pb->end); - numa_remove_memblk_from(phys_blk, pi); - } - - printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n", - nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20); - return 0; -} - -/* - * Sets up nr_nodes fake nodes interleaved over physical nodes ranging fro= m addr - * to max_addr. - * - * Returns zero on success or negative on error. - */ -static int __init split_nodes_interleave(struct numa_meminfo *ei, - struct numa_meminfo *pi, - u64 addr, u64 max_addr, int nr_nodes) -{ - nodemask_t physnode_mask =3D numa_nodes_parsed; - u64 size; - int big; - int nid =3D 0; - int i, ret; - - if (nr_nodes <=3D 0) - return -1; - if (nr_nodes > MAX_NUMNODES) { - pr_info("numa=3Dfake=3D%d too large, reducing to %d\n", - nr_nodes, MAX_NUMNODES); - nr_nodes =3D MAX_NUMNODES; - } - - /* - * Calculate target node size. x86_32 freaks on __udivdi3() so do - * the division in ulong number of pages and convert back. - */ - size =3D max_addr - addr - mem_hole_size(addr, max_addr); - size =3D PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes); - - /* - * Calculate the number of big nodes that can be allocated as a result - * of consolidating the remainder. - */ - big =3D ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / - FAKE_NODE_MIN_SIZE; - - size &=3D FAKE_NODE_MIN_HASH_MASK; - if (!size) { - pr_err("Not enough memory for each node. 
" - "NUMA emulation disabled.\n"); - return -1; - } - - /* - * Continue to fill physical nodes with fake nodes until there is no - * memory left on any of them. - */ - while (!nodes_empty(physnode_mask)) { - for_each_node_mask(i, physnode_mask) { - u64 dma32_end =3D PFN_PHYS(MAX_DMA32_PFN); - u64 start, limit, end; - int phys_blk; - - phys_blk =3D emu_find_memblk_by_nid(i, pi); - if (phys_blk < 0) { - node_clear(i, physnode_mask); - continue; - } - start =3D pi->blk[phys_blk].start; - limit =3D pi->blk[phys_blk].end; - end =3D start + size; - - if (nid < big) - end +=3D FAKE_NODE_MIN_SIZE; - - /* - * Continue to add memory to this fake node if its - * non-reserved memory is less than the per-node size. - */ - while (end - start - mem_hole_size(start, end) < size) { - end +=3D FAKE_NODE_MIN_SIZE; - if (end > limit) { - end =3D limit; - break; - } - } - - /* - * If there won't be at least FAKE_NODE_MIN_SIZE of - * non-reserved memory in ZONE_DMA32 for the next node, - * this one must extend to the boundary. - */ - if (end < dma32_end && dma32_end - end - - mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) - end =3D dma32_end; - - /* - * If there won't be enough non-reserved memory for the - * next node, this one must extend to the end of the - * physical node. - */ - if (limit - end - mem_hole_size(end, limit) < size) - end =3D limit; - - ret =3D emu_setup_memblk(ei, pi, nid++ % nr_nodes, - phys_blk, - min(end, limit) - start); - if (ret < 0) - return ret; - } - } - return 0; -} - -/* - * Returns the end address of a node so that there is at least `size' amou= nt of - * non-reserved memory or `max_addr' is reached. 
- */ -static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) -{ - u64 end =3D start + size; - - while (end - start - mem_hole_size(start, end) < size) { - end +=3D FAKE_NODE_MIN_SIZE; - if (end > max_addr) { - end =3D max_addr; - break; - } - } - return end; -} - -static u64 uniform_size(u64 max_addr, u64 base, u64 hole, int nr_nodes) -{ - unsigned long max_pfn =3D PHYS_PFN(max_addr); - unsigned long base_pfn =3D PHYS_PFN(base); - unsigned long hole_pfns =3D PHYS_PFN(hole); - - return PFN_PHYS((max_pfn - base_pfn - hole_pfns) / nr_nodes); -} - -/* - * Sets up fake nodes of `size' interleaved over physical nodes ranging fr= om - * `addr' to `max_addr'. - * - * Returns zero on success or negative on error. - */ -static int __init split_nodes_size_interleave_uniform(struct numa_meminfo = *ei, - struct numa_meminfo *pi, - u64 addr, u64 max_addr, u64 size, - int nr_nodes, struct numa_memblk *pblk, - int nid) -{ - nodemask_t physnode_mask =3D numa_nodes_parsed; - int i, ret, uniform =3D 0; - u64 min_size; - - if ((!size && !nr_nodes) || (nr_nodes && !pblk)) - return -1; - - /* - * In the 'uniform' case split the passed in physical node by - * nr_nodes, in the non-uniform case, ignore the passed in - * physical block and try to create nodes of at least size - * @size. - * - * In the uniform case, split the nodes strictly by physical - * capacity, i.e. ignore holes. In the non-uniform case account - * for holes and treat @size as a minimum floor. - */ - if (!nr_nodes) - nr_nodes =3D MAX_NUMNODES; - else { - nodes_clear(physnode_mask); - node_set(pblk->nid, physnode_mask); - uniform =3D 1; - } - - if (uniform) { - min_size =3D uniform_size(max_addr, addr, 0, nr_nodes); - size =3D min_size; - } else { - /* - * The limit on emulated nodes is MAX_NUMNODES, so the - * size per node is increased accordingly if the - * requested size is too small. 
This creates a uniform - * distribution of node sizes across the entire machine - * (but not necessarily over physical nodes). - */ - min_size =3D uniform_size(max_addr, addr, - mem_hole_size(addr, max_addr), nr_nodes); - } - min_size =3D ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE); - if (size < min_size) { - pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", - size >> 20, min_size >> 20); - size =3D min_size; - } - size =3D ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE); - - /* - * Fill physical nodes with fake nodes of size until there is no memory - * left on any of them. - */ - while (!nodes_empty(physnode_mask)) { - for_each_node_mask(i, physnode_mask) { - u64 dma32_end =3D PFN_PHYS(MAX_DMA32_PFN); - u64 start, limit, end; - int phys_blk; - - phys_blk =3D emu_find_memblk_by_nid(i, pi); - if (phys_blk < 0) { - node_clear(i, physnode_mask); - continue; - } - - start =3D pi->blk[phys_blk].start; - limit =3D pi->blk[phys_blk].end; - - if (uniform) - end =3D start + size; - else - end =3D find_end_of_node(start, limit, size); - /* - * If there won't be at least FAKE_NODE_MIN_SIZE of - * non-reserved memory in ZONE_DMA32 for the next node, - * this one must extend to the boundary. - */ - if (end < dma32_end && dma32_end - end - - mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) - end =3D dma32_end; - - /* - * If there won't be enough non-reserved memory for the - * next node, this one must extend to the end of the - * physical node. 
- */ - if ((limit - end - mem_hole_size(end, limit) < size) - && !uniform) - end =3D limit; - - ret =3D emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES, - phys_blk, - min(end, limit) - start); - if (ret < 0) - return ret; - } - } - return nid; -} - -static int __init split_nodes_size_interleave(struct numa_meminfo *ei, - struct numa_meminfo *pi, - u64 addr, u64 max_addr, u64 size) -{ - return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size, - 0, NULL, 0); -} - -static int __init setup_emu2phys_nid(int *dfl_phys_nid) -{ - int i, max_emu_nid =3D 0; - - *dfl_phys_nid =3D NUMA_NO_NODE; - for (i =3D 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { - if (emu_nid_to_phys[i] !=3D NUMA_NO_NODE) { - max_emu_nid =3D i; - if (*dfl_phys_nid =3D=3D NUMA_NO_NODE) - *dfl_phys_nid =3D emu_nid_to_phys[i]; - } - } - - return max_emu_nid; -} - -/** - * numa_emulation - Emulate NUMA nodes - * @numa_meminfo: NUMA configuration to massage - * @numa_dist_cnt: The size of the physical NUMA distance table - * - * Emulate NUMA nodes according to the numa=3Dfake kernel parameter. - * @numa_meminfo contains the physical memory configuration and is modified - * to reflect the emulated configuration on success. @numa_dist_cnt is - * used to determine the size of the physical distance table. - * - * On success, the following modifications are made. - * - * - @numa_meminfo is updated to reflect the emulated nodes. - * - * - __apicid_to_node[] is updated such that APIC IDs are mapped to the - * emulated nodes. - * - * - NUMA distance table is rebuilt to represent distances between emulated - * nodes. The distances are determined considering how emulated nodes - * are mapped to physical nodes and match the actual distances. - * - * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical - * nodes. This is used by numa_add_cpu() and numa_remove_cpu(). 
- * - * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with - * identity mapping and no other modification is made. - */ -void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dis= t_cnt) -{ - static struct numa_meminfo ei __initdata; - static struct numa_meminfo pi __initdata; - const u64 max_addr =3D PFN_PHYS(max_pfn); - u8 *phys_dist =3D NULL; - size_t phys_size =3D numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); - int max_emu_nid, dfl_phys_nid; - int i, j, ret; - - if (!emu_cmdline) - goto no_emu; - - memset(&ei, 0, sizeof(ei)); - pi =3D *numa_meminfo; - - for (i =3D 0; i < MAX_NUMNODES; i++) - emu_nid_to_phys[i] =3D NUMA_NO_NODE; - - /* - * If the numa=3Dfake command-line contains a 'M' or 'G', it represents - * the fixed node size. Otherwise, if it is just a single number N, - * split the system RAM into N fake nodes. - */ - if (strchr(emu_cmdline, 'U')) { - nodemask_t physnode_mask =3D numa_nodes_parsed; - unsigned long n; - int nid =3D 0; - - n =3D simple_strtoul(emu_cmdline, &emu_cmdline, 0); - ret =3D -1; - for_each_node_mask(i, physnode_mask) { - /* - * The reason we pass in blk[0] is due to - * numa_remove_memblk_from() called by - * emu_setup_memblk() will delete entry 0 - * and then move everything else up in the pi.blk - * array. Therefore we should always be looking - * at blk[0]. 
- */ - ret =3D split_nodes_size_interleave_uniform(&ei, &pi, - pi.blk[0].start, pi.blk[0].end, 0, - n, &pi.blk[0], nid); - if (ret < 0) - break; - if (ret < n) { - pr_info("%s: phys: %d only got %d of %ld nodes, failing\n", - __func__, i, ret, n); - ret =3D -1; - break; - } - nid =3D ret; - } - } else if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) { - u64 size; - - size =3D memparse(emu_cmdline, &emu_cmdline); - ret =3D split_nodes_size_interleave(&ei, &pi, 0, max_addr, size); - } else { - unsigned long n; - - n =3D simple_strtoul(emu_cmdline, &emu_cmdline, 0); - ret =3D split_nodes_interleave(&ei, &pi, 0, max_addr, n); - } - if (*emu_cmdline =3D=3D ':') - emu_cmdline++; - - if (ret < 0) - goto no_emu; - - if (numa_cleanup_meminfo(&ei) < 0) { - pr_warn("NUMA: Warning: constructed meminfo invalid, disabling emulation= \n"); - goto no_emu; - } - - /* copy the physical distance table */ - if (numa_dist_cnt) { - u64 phys; - - phys =3D memblock_phys_alloc_range(phys_size, PAGE_SIZE, 0, - PFN_PHYS(max_pfn_mapped)); - if (!phys) { - pr_warn("NUMA: Warning: can't allocate copy of distance table, disablin= g emulation\n"); - goto no_emu; - } - phys_dist =3D __va(phys); - - for (i =3D 0; i < numa_dist_cnt; i++) - for (j =3D 0; j < numa_dist_cnt; j++) - phys_dist[i * numa_dist_cnt + j] =3D - node_distance(i, j); - } - - /* - * Determine the max emulated nid and the default phys nid to use - * for unmapped nodes. - */ - max_emu_nid =3D setup_emu2phys_nid(&dfl_phys_nid); - - /* commit */ - *numa_meminfo =3D ei; - - /* Make sure numa_nodes_parsed only contains emulated nodes */ - nodes_clear(numa_nodes_parsed); - for (i =3D 0; i < ARRAY_SIZE(ei.blk); i++) - if (ei.blk[i].start !=3D ei.blk[i].end && - ei.blk[i].nid !=3D NUMA_NO_NODE) - node_set(ei.blk[i].nid, numa_nodes_parsed); - - /* - * Transform __apicid_to_node table to use emulated nids by - * reverse-mapping phys_nid. The maps should always exist but fall - * back to zero just in case. 
- */ - for (i =3D 0; i < ARRAY_SIZE(__apicid_to_node); i++) { - if (__apicid_to_node[i] =3D=3D NUMA_NO_NODE) - continue; - for (j =3D 0; j < ARRAY_SIZE(emu_nid_to_phys); j++) - if (__apicid_to_node[i] =3D=3D emu_nid_to_phys[j]) - break; - __apicid_to_node[i] =3D j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0; - } - - /* make sure all emulated nodes are mapped to a physical node */ - for (i =3D 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) - if (emu_nid_to_phys[i] =3D=3D NUMA_NO_NODE) - emu_nid_to_phys[i] =3D dfl_phys_nid; - - /* transform distance table */ - numa_reset_distance(); - for (i =3D 0; i < max_emu_nid + 1; i++) { - for (j =3D 0; j < max_emu_nid + 1; j++) { - int physi =3D emu_nid_to_phys[i]; - int physj =3D emu_nid_to_phys[j]; - int dist; - - if (get_option(&emu_cmdline, &dist) =3D=3D 2) - ; - else if (physi >=3D numa_dist_cnt || physj >=3D numa_dist_cnt) - dist =3D physi =3D=3D physj ? - LOCAL_DISTANCE : REMOTE_DISTANCE; - else - dist =3D phys_dist[physi * numa_dist_cnt + physj]; - - numa_set_distance(i, j, dist); - } - } - - /* free the copied physical distance table */ - memblock_free(phys_dist, phys_size); - return; - -no_emu: - /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */ - for (i =3D 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) - emu_nid_to_phys[i] =3D i; -} - -#ifndef CONFIG_DEBUG_PER_CPU_MAPS -void numa_add_cpu(int cpu) -{ - int physnid, nid; - - nid =3D early_cpu_to_node(cpu); - BUG_ON(nid =3D=3D NUMA_NO_NODE || !node_online(nid)); - - physnid =3D emu_nid_to_phys[nid]; - - /* - * Map the cpu to each emulated node that is allocated on the physical - * node of the cpu's apic id. 
- */ - for_each_online_node(nid) - if (emu_nid_to_phys[nid] =3D=3D physnid) - cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); -} - -void numa_remove_cpu(int cpu) -{ - int i; - - for_each_online_node(i) - cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); -} -#else /* !CONFIG_DEBUG_PER_CPU_MAPS */ -static void numa_set_cpumask(int cpu, bool enable) -{ - int nid, physnid; - - nid =3D early_cpu_to_node(cpu); - if (nid =3D=3D NUMA_NO_NODE) { - /* early_cpu_to_node() already emits a warning and trace */ - return; - } - - physnid =3D emu_nid_to_phys[nid]; - - for_each_online_node(nid) { - if (emu_nid_to_phys[nid] !=3D physnid) - continue; - - debug_cpumask_set_cpu(cpu, nid, enable); - } -} - -void numa_add_cpu(int cpu) -{ - numa_set_cpumask(cpu, true); -} - -void numa_remove_cpu(int cpu) -{ - numa_set_cpumask(cpu, false); -} -#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c index 5df0ad5cb09d..67bdbcd0caf9 100644 --- a/drivers/base/arch_numa.c +++ b/drivers/base/arch_numa.c @@ -13,6 +13,7 @@ #include #include =20 +#include #include =20 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; @@ -30,6 +31,8 @@ static __init int numa_parse_early_param(char *opt) return -EINVAL; if (str_has_prefix(opt, "off")) numa_off =3D true; + if (!strncmp(opt, "fake=3D", 5)) + return numa_emu_cmdline(opt + 5); =20 return 0; } diff --git a/include/asm-generic/numa.h b/include/asm-generic/numa.h index 929d7c582a73..4658155a070a 100644 --- a/include/asm-generic/numa.h +++ b/include/asm-generic/numa.h @@ -50,12 +50,24 @@ struct numa_meminfo { struct numa_memblk blk[NR_NODE_MEMBLKS]; }; =20 +#ifdef CONFIG_NUMA_EMU +#define FAKE_NODE_MIN_SIZE ((u64)32 << 20) +#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) + extern struct numa_meminfo numa_meminfo; +extern char *emu_cmdline __initdata; =20 +int numa_emu_cmdline(char *str); int __init numa_register_memblks(struct numa_meminfo *mi); int __init numa_cleanup_meminfo(struct 
numa_meminfo *mi); void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt); +#else +static inline int numa_emu_cmdline(char *str) +{ + return -EINVAL; +} +#endif =20 #else /* CONFIG_NUMA */ =20 diff --git a/mm/Kconfig b/mm/Kconfig index 264a2df5ecf5..22bead675ee6 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -549,6 +549,14 @@ config ARCH_ENABLE_MEMORY_HOTPLUG config ARCH_ENABLE_MEMORY_HOTREMOVE bool =20 +config NUMA_EMU + bool "NUMA emulation (EXPERIMENTAL)" + depends on NUMA && (X86 || ARM64) + help + Enable NUMA emulation. A flat machine will be split + into virtual nodes when booted with "numa=3Dfake=3DN", where N is the + number of nodes. This is only useful for debugging. + # eventually, we can have this option just 'select SPARSEMEM' menuconfig MEMORY_HOTPLUG bool "Memory hotplug" diff --git a/mm/numa.c b/mm/numa.c index 88277e8404f0..3cc01f06a2a6 100644 --- a/mm/numa.c +++ b/mm/numa.c @@ -16,6 +16,10 @@ struct numa_meminfo numa_meminfo __initdata_or_meminfo; struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo; =20 +#ifdef CONFIG_NUMA_EMU +char *emu_cmdline __initdata; +#endif + /* * Set nodes, which have memory in @mi, in *@nodemask. 
*/ @@ -296,3 +300,11 @@ int __weak __init numa_register_memblks(struct numa_me= minfo *mi) =20 return 0; } + +#ifdef CONFIG_NUMA_EMU +int __init numa_emu_cmdline(char *str) +{ + emu_cmdline =3D str; + return 0; +} +#endif --=20 2.32.0.3.gf3a3e56d6 From nobody Fri Jan 2 11:57:02 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id F3CCBCDB46E for ; Thu, 12 Oct 2023 02:49:36 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1347057AbjJLCte (ORCPT ); Wed, 11 Oct 2023 22:49:34 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:41086 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1376859AbjJLCs6 (ORCPT ); Wed, 11 Oct 2023 22:48:58 -0400 Received: from out30-100.freemail.mail.aliyun.com (out30-100.freemail.mail.aliyun.com [115.124.30.100]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id D30D9A4 for ; Wed, 11 Oct 2023 19:48:55 -0700 (PDT) X-Alimail-AntiSpam: AC=PASS;BC=-1|-1;BR=01201311R171e4;CH=green;DM=||false|;DS=||;FP=0|-1|-1|-1|0|-1|-1|-1;HT=ay29a033018046059;MF=rongwei.wang@linux.alibaba.com;NM=1;PH=DS;RN=9;SR=0;TI=SMTPD_---0VtykMhN_1697078932; Received: from localhost.localdomain(mailfrom:rongwei.wang@linux.alibaba.com fp:SMTPD_---0VtykMhN_1697078932) by smtp.aliyun-inc.com; Thu, 12 Oct 2023 10:48:53 +0800 From: Rongwei Wang To: linux-arm-kernel@lists.infradead.org, linux-kernel@vger.kernel.org, linux-mm@kvack.org Cc: akpm@linux-foundation.org, willy@infradead.org, catalin.marinas@arm.com, dave.hansen@linux.intel.com, tj@kernel.org, mingo@redhat.com Subject: [PATCH RFC 5/5] mm/numa: migrate leftover numa emulation into mm/numa.c Date: Thu, 12 Oct 2023 10:48:42 +0800 Message-Id: <20231012024842.99703-6-rongwei.wang@linux.alibaba.com> X-Mailer: git-send-email 2.40.0 In-Reply-To: 
<20231012024842.99703-1-rongwei.wang@linux.alibaba.com> References: <20231012024842.99703-1-rongwei.wang@linux.alibaba.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" Move the leftover NUMA emulation code from the original arch/x86/mm/numa_emulation.c into mm/numa.c. The next step is to enable it for arm64. Signed-off-by: Rongwei Wang --- drivers/base/arch_numa.c | 2 + include/asm-generic/numa.h | 3 + mm/numa.c | 586 ++++++++++++++++++++++++++++++++++++- 3 files changed, 587 insertions(+), 4 deletions(-) diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c index 67bdbcd0caf9..c6f5ceadb9e1 100644 --- a/drivers/base/arch_numa.c +++ b/drivers/base/arch_numa.c @@ -64,6 +64,7 @@ EXPORT_SYMBOL(cpumask_of_node); =20 #endif =20 +#ifndef CONFIG_NUMA_EMU static void numa_update_cpu(unsigned int cpu, bool remove) { int nid =3D cpu_to_node(cpu); @@ -92,6 +93,7 @@ void numa_clear_node(unsigned int cpu) numa_remove_cpu(cpu); set_cpu_numa_node(cpu, NUMA_NO_NODE); } +#endif =20 /* * Allocate node_to_cpumask_map based on number of available nodes diff --git a/include/asm-generic/numa.h b/include/asm-generic/numa.h index 4658155a070a..9969ec7f59a4 100644 --- a/include/asm-generic/numa.h +++ b/include/asm-generic/numa.h @@ -55,6 +55,7 @@ struct numa_meminfo { #define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) =20 extern struct numa_meminfo numa_meminfo; +extern int emu_nid_to_phys[MAX_NUMNODES]; extern char *emu_cmdline __initdata; =20 int numa_emu_cmdline(char *str); @@ -62,6 +63,8 @@ int __init numa_register_memblks(struct numa_meminfo *mi); int __init numa_cleanup_meminfo(struct numa_meminfo *mi); void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt); +int __init numa_add_memblk_to(int nid, u64 start, u64 end, + struct numa_meminfo *mi); #else static inline int numa_emu_cmdline(char *str) { diff --git a/mm/numa.c b/mm/numa.c index 
3cc01f06a2a6..a6e9652498c9 100644 --- a/mm/numa.c +++ b/mm/numa.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only +/* Most of this file comes from x86/numa_emulation.c */ #include #include #include @@ -16,10 +17,6 @@ struct numa_meminfo numa_meminfo __initdata_or_meminfo; struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo; =20 -#ifdef CONFIG_NUMA_EMU -char *emu_cmdline __initdata; -#endif - /* * Set nodes, which have memory in @mi, in *@nodemask. */ @@ -302,9 +299,590 @@ int __weak __init numa_register_memblks(struct numa_m= eminfo *mi) } =20 #ifdef CONFIG_NUMA_EMU +int emu_nid_to_phys[MAX_NUMNODES]; +char *emu_cmdline __initdata; + int __init numa_emu_cmdline(char *str) { emu_cmdline =3D str; return 0; } + +static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminf= o *mi) +{ + int i; + + for (i =3D 0; i < mi->nr_blks; i++) + if (mi->blk[i].nid =3D=3D nid) + return i; + return -ENOENT; +} + +static u64 __init mem_hole_size(u64 start, u64 end) +{ + unsigned long start_pfn =3D PFN_UP(start); + unsigned long end_pfn =3D PFN_DOWN(end); + + if (start_pfn < end_pfn) + return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn)); + return 0; +} + +/* + * Sets up nid to range from @start to @end. The return value is -errno if + * something went wrong, 0 otherwise. 
+ */ +static int __init emu_setup_memblk(struct numa_meminfo *ei, + struct numa_meminfo *pi, + int nid, int phys_blk, u64 size) +{ + struct numa_memblk *eb =3D &ei->blk[ei->nr_blks]; + struct numa_memblk *pb =3D &pi->blk[phys_blk]; + + if (ei->nr_blks >=3D NR_NODE_MEMBLKS) { + pr_err("NUMA: Too many emulated memblks, failing emulation\n"); + return -EINVAL; + } + + ei->nr_blks++; + eb->start =3D pb->start; + eb->end =3D pb->start + size; + eb->nid =3D nid; + + if (emu_nid_to_phys[nid] =3D=3D NUMA_NO_NODE) + emu_nid_to_phys[nid] =3D pb->nid; + + pb->start +=3D size; + if (pb->start >=3D pb->end) { + WARN_ON_ONCE(pb->start > pb->end); + numa_remove_memblk_from(phys_blk, pi); + } + + printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n", + nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20); + return 0; +} + +/* + * Sets up nr_nodes fake nodes interleaved over physical nodes ranging fro= m addr + * to max_addr. + * + * Returns zero on success or negative on error. + */ +static int __init split_nodes_interleave(struct numa_meminfo *ei, + struct numa_meminfo *pi, + u64 addr, u64 max_addr, int nr_nodes) +{ + nodemask_t physnode_mask =3D numa_nodes_parsed; + u64 size; + int big; + int nid =3D 0; + int i, ret; + + if (nr_nodes <=3D 0) + return -1; + if (nr_nodes > MAX_NUMNODES) { + pr_info("numa=3Dfake=3D%d too large, reducing to %d\n", + nr_nodes, MAX_NUMNODES); + nr_nodes =3D MAX_NUMNODES; + } + + /* + * Calculate target node size. x86_32 freaks on __udivdi3() so do + * the division in ulong number of pages and convert back. + */ + size =3D max_addr - addr - mem_hole_size(addr, max_addr); + size =3D PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes); + + /* + * Calculate the number of big nodes that can be allocated as a result + * of consolidating the remainder. + */ + big =3D ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / + FAKE_NODE_MIN_SIZE; + + size &=3D FAKE_NODE_MIN_HASH_MASK; + if (!size) { + pr_err("Not enough memory for each node. 
" + "NUMA emulation disabled.\n"); + return -1; + } + + /* + * Continue to fill physical nodes with fake nodes until there is no + * memory left on any of them. + */ + while (!nodes_empty(physnode_mask)) { + for_each_node_mask(i, physnode_mask) { +#ifdef CONFIG_X86 + u64 dma32_end =3D PFN_PHYS(MAX_DMA32_PFN); +#endif + u64 start, limit, end; + int phys_blk; + + phys_blk =3D emu_find_memblk_by_nid(i, pi); + if (phys_blk < 0) { + node_clear(i, physnode_mask); + continue; + } + start =3D pi->blk[phys_blk].start; + limit =3D pi->blk[phys_blk].end; + end =3D start + size; + + if (nid < big) + end +=3D FAKE_NODE_MIN_SIZE; + + /* + * Continue to add memory to this fake node if its + * non-reserved memory is less than the per-node size. + */ + while (end - start - mem_hole_size(start, end) < size) { + end +=3D FAKE_NODE_MIN_SIZE; + if (end > limit) { + end =3D limit; + break; + } + } + +#ifdef CONFIG_X86 + /* + * If there won't be at least FAKE_NODE_MIN_SIZE of + * non-reserved memory in ZONE_DMA32 for the next node, + * this one must extend to the boundary. + */ + if (end < dma32_end && dma32_end - end - + mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) + end =3D dma32_end; +#endif + + /* + * If there won't be enough non-reserved memory for the + * next node, this one must extend to the end of the + * physical node. + */ + if (limit - end - mem_hole_size(end, limit) < size) + end =3D limit; + + ret =3D emu_setup_memblk(ei, pi, nid++ % nr_nodes, + phys_blk, + min(end, limit) - start); + if (ret < 0) + return ret; + } + } + return 0; +} + +/* + * Returns the end address of a node so that there is at least `size' amou= nt of + * non-reserved memory or `max_addr' is reached. 
+ */ +static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) +{ + u64 end =3D start + size; + + while (end - start - mem_hole_size(start, end) < size) { + end +=3D FAKE_NODE_MIN_SIZE; + if (end > max_addr) { + end =3D max_addr; + break; + } + } + return end; +} + +static u64 uniform_size(u64 max_addr, u64 base, u64 hole, int nr_nodes) +{ + unsigned long max_pfn =3D PHYS_PFN(max_addr); + unsigned long base_pfn =3D PHYS_PFN(base); + unsigned long hole_pfns =3D PHYS_PFN(hole); + + return PFN_PHYS((max_pfn - base_pfn - hole_pfns) / nr_nodes); +} + +/* + * Sets up fake nodes of `size' interleaved over physical nodes ranging fr= om + * `addr' to `max_addr'. + * + * Returns zero on success or negative on error. + */ +static int __init split_nodes_size_interleave_uniform(struct numa_meminfo = *ei, + struct numa_meminfo *pi, + u64 addr, u64 max_addr, u64 size, + int nr_nodes, struct numa_memblk *pblk, + int nid) +{ + nodemask_t physnode_mask =3D numa_nodes_parsed; + int i, ret, uniform =3D 0; + u64 min_size; + + if ((!size && !nr_nodes) || (nr_nodes && !pblk)) + return -1; + + /* + * In the 'uniform' case split the passed in physical node by + * nr_nodes, in the non-uniform case, ignore the passed in + * physical block and try to create nodes of at least size + * @size. + * + * In the uniform case, split the nodes strictly by physical + * capacity, i.e. ignore holes. In the non-uniform case account + * for holes and treat @size as a minimum floor. + */ + if (!nr_nodes) + nr_nodes =3D MAX_NUMNODES; + else { + nodes_clear(physnode_mask); + node_set(pblk->nid, physnode_mask); + uniform =3D 1; + } + + if (uniform) { + min_size =3D uniform_size(max_addr, addr, 0, nr_nodes); + size =3D min_size; + } else { + /* + * The limit on emulated nodes is MAX_NUMNODES, so the + * size per node is increased accordingly if the + * requested size is too small. 
This creates a uniform + * distribution of node sizes across the entire machine + * (but not necessarily over physical nodes). + */ + min_size =3D uniform_size(max_addr, addr, + mem_hole_size(addr, max_addr), nr_nodes); + } + min_size =3D ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE); + if (size < min_size) { + pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", + size >> 20, min_size >> 20); + size =3D min_size; + } + size =3D ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE); + + /* + * Fill physical nodes with fake nodes of size until there is no memory + * left on any of them. + */ + while (!nodes_empty(physnode_mask)) { + for_each_node_mask(i, physnode_mask) { +#ifdef CONFIG_X86 + u64 dma32_end =3D PFN_PHYS(MAX_DMA32_PFN); +#endif + u64 start, limit, end; + int phys_blk; + + phys_blk =3D emu_find_memblk_by_nid(i, pi); + if (phys_blk < 0) { + node_clear(i, physnode_mask); + continue; + } + + start =3D pi->blk[phys_blk].start; + limit =3D pi->blk[phys_blk].end; + + if (uniform) + end =3D start + size; + else + end =3D find_end_of_node(start, limit, size); + +#ifdef CONFIG_X86 + /* + * If there won't be at least FAKE_NODE_MIN_SIZE of + * non-reserved memory in ZONE_DMA32 for the next node, + * this one must extend to the boundary. + */ + if (end < dma32_end && dma32_end - end - + mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) + end =3D dma32_end; +#endif + + /* + * If there won't be enough non-reserved memory for the + * next node, this one must extend to the end of the + * physical node. 
+ */ + if ((limit - end - mem_hole_size(end, limit) < size) + && !uniform) + end =3D limit; + + ret =3D emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES, + phys_blk, + min(end, limit) - start); + if (ret < 0) + return ret; + } + } + return nid; +} + +static int __init split_nodes_size_interleave(struct numa_meminfo *ei, + struct numa_meminfo *pi, + u64 addr, u64 max_addr, u64 size) +{ + return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size, + 0, NULL, 0); +} + +static int __init setup_emu2phys_nid(int *dfl_phys_nid) +{ + int i, max_emu_nid =3D 0; + + *dfl_phys_nid =3D NUMA_NO_NODE; + for (i =3D 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { + if (emu_nid_to_phys[i] !=3D NUMA_NO_NODE) { + max_emu_nid =3D i; + if (*dfl_phys_nid =3D=3D NUMA_NO_NODE) + *dfl_phys_nid =3D emu_nid_to_phys[i]; + } + } + + return max_emu_nid; +} + +/** + * numa_emulation - Emulate NUMA nodes + * @numa_meminfo: NUMA configuration to massage + * @numa_dist_cnt: The size of the physical NUMA distance table + * + * Emulate NUMA nodes according to the numa=3Dfake kernel parameter. + * @numa_meminfo contains the physical memory configuration and is modified + * to reflect the emulated configuration on success. @numa_dist_cnt is + * used to determine the size of the physical distance table. + * + * On success, the following modifications are made. + * + * - @numa_meminfo is updated to reflect the emulated nodes. + * + * - __apicid_to_node[] is updated such that APIC IDs are mapped to the + * emulated nodes. + * + * - NUMA distance table is rebuilt to represent distances between emulated + * nodes. The distances are determined considering how emulated nodes + * are mapped to physical nodes and match the actual distances. + * + * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical + * nodes. This is used by numa_add_cpu() and numa_remove_cpu(). 
+ * + * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with + * identity mapping and no other modification is made. + */ +void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dis= t_cnt) +{ + static struct numa_meminfo ei __initdata; + static struct numa_meminfo pi __initdata; + const u64 max_addr =3D PFN_PHYS(max_pfn); + u8 *phys_dist =3D NULL; + size_t phys_size =3D numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); + int max_emu_nid, dfl_phys_nid; + int i, j, ret; + + if (!emu_cmdline) + goto no_emu; + + memset(&ei, 0, sizeof(ei)); + pi =3D *numa_meminfo; + + for (i =3D 0; i < MAX_NUMNODES; i++) + emu_nid_to_phys[i] =3D NUMA_NO_NODE; + + /* + * If the numa=3Dfake command-line contains a 'M' or 'G', it represents + * the fixed node size. Otherwise, if it is just a single number N, + * split the system RAM into N fake nodes. + */ + if (strchr(emu_cmdline, 'U')) { + nodemask_t physnode_mask =3D numa_nodes_parsed; + unsigned long n; + int nid =3D 0; + + n =3D simple_strtoul(emu_cmdline, &emu_cmdline, 0); + ret =3D -1; + for_each_node_mask(i, physnode_mask) { + /* + * The reason we pass in blk[0] is due to + * numa_remove_memblk_from() called by + * emu_setup_memblk() will delete entry 0 + * and then move everything else up in the pi.blk + * array. Therefore we should always be looking + * at blk[0]. 
+ */ + ret =3D split_nodes_size_interleave_uniform(&ei, &pi, + pi.blk[0].start, pi.blk[0].end, 0, + n, &pi.blk[0], nid); + if (ret < 0) + break; + if (ret < n) { + pr_info("%s: phys: %d only got %d of %ld nodes, failing\n", + __func__, i, ret, n); + ret =3D -1; + break; + } + nid =3D ret; + } + } else if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) { + u64 size; + + size =3D memparse(emu_cmdline, &emu_cmdline); + ret =3D split_nodes_size_interleave(&ei, &pi, 0, max_addr, size); + } else { + unsigned long n; + + n =3D simple_strtoul(emu_cmdline, &emu_cmdline, 0); + ret =3D split_nodes_interleave(&ei, &pi, 0, max_addr, n); + } + if (*emu_cmdline =3D=3D ':') + emu_cmdline++; + + if (ret < 0) + goto no_emu; + + if (numa_cleanup_meminfo(&ei) < 0) { + pr_warn("NUMA: Warning: constructed meminfo invalid, disabling emulation= \n"); + goto no_emu; + } + + /* copy the physical distance table */ + if (numa_dist_cnt) { + u64 phys; + + phys =3D memblock_phys_alloc_range(phys_size, PAGE_SIZE, 0, + MEMBLOCK_ALLOC_ACCESSIBLE); + if (!phys) { + pr_warn("NUMA: Warning: can't allocate copy of distance table, disablin= g emulation\n"); + goto no_emu; + } + phys_dist =3D __va(phys); + + for (i =3D 0; i < numa_dist_cnt; i++) + for (j =3D 0; j < numa_dist_cnt; j++) + phys_dist[i * numa_dist_cnt + j] =3D + node_distance(i, j); + } + + /* + * Determine the max emulated nid and the default phys nid to use + * for unmapped nodes. + */ + max_emu_nid =3D setup_emu2phys_nid(&dfl_phys_nid); + + /* commit */ + *numa_meminfo =3D ei; + + /* Make sure numa_nodes_parsed only contains emulated nodes */ + nodes_clear(numa_nodes_parsed); + for (i =3D 0; i < ARRAY_SIZE(ei.blk); i++) + if (ei.blk[i].start !=3D ei.blk[i].end && + ei.blk[i].nid !=3D NUMA_NO_NODE) + node_set(ei.blk[i].nid, numa_nodes_parsed); + +#ifdef CONFIG_X86 + /* + * Transform __apicid_to_node table to use emulated nids by + * reverse-mapping phys_nid. The maps should always exist but fall + * back to zero just in case. 
+ */ + for (i =3D 0; i < ARRAY_SIZE(__apicid_to_node); i++) { + if (__apicid_to_node[i] =3D=3D NUMA_NO_NODE) + continue; + for (j =3D 0; j < ARRAY_SIZE(emu_nid_to_phys); j++) + if (__apicid_to_node[i] =3D=3D emu_nid_to_phys[j]) + break; + __apicid_to_node[i] =3D j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0; + } +#endif + + /* make sure all emulated nodes are mapped to a physical node */ + for (i =3D 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) + if (emu_nid_to_phys[i] =3D=3D NUMA_NO_NODE) + emu_nid_to_phys[i] =3D dfl_phys_nid; + + /* transform distance table */ + numa_free_distance(); + for (i =3D 0; i < max_emu_nid + 1; i++) { + for (j =3D 0; j < max_emu_nid + 1; j++) { + int physi =3D emu_nid_to_phys[i]; + int physj =3D emu_nid_to_phys[j]; + int dist; + + if (get_option(&emu_cmdline, &dist) =3D=3D 2) + ; + else if (physi >=3D numa_dist_cnt || physj >=3D numa_dist_cnt) + dist =3D physi =3D=3D physj ? + LOCAL_DISTANCE : REMOTE_DISTANCE; + else + dist =3D phys_dist[physi * numa_dist_cnt + physj]; + + numa_set_distance(i, j, dist); + } + } + + /* free the copied physical distance table */ + memblock_free(phys_dist, phys_size); + return; + +no_emu: + /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */ + for (i =3D 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) + emu_nid_to_phys[i] =3D i; +} + +#ifndef CONFIG_DEBUG_PER_CPU_MAPS +extern int early_cpu_to_node(unsigned int cpu); + +void numa_add_cpu(unsigned int cpu) +{ + int physnid, nid; + + nid =3D early_cpu_to_node(cpu); + BUG_ON(nid =3D=3D NUMA_NO_NODE || !node_online(nid)); + + physnid =3D emu_nid_to_phys[nid]; + + /* + * Map the cpu to each emulated node that is allocated on the physical + * node of the cpu's apic id. 
+ */ + for_each_online_node(nid) + if (emu_nid_to_phys[nid] =3D=3D physnid) + cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); +} + +void numa_remove_cpu(unsigned int cpu) +{ + int i; + + for_each_online_node(i) + cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); +} +#else /* !CONFIG_DEBUG_PER_CPU_MAPS */ +static void numa_set_cpumask(int cpu, bool enable) +{ + int nid, physnid; + + nid =3D early_cpu_to_node(cpu); + if (nid =3D=3D NUMA_NO_NODE) { + /* early_cpu_to_node() already emits a warning and trace */ + return; + } + + physnid =3D emu_nid_to_phys[nid]; + + for_each_online_node(nid) { + if (emu_nid_to_phys[nid] !=3D physnid) + continue; + + debug_cpumask_set_cpu(cpu, nid, enable); + } +} + +void numa_add_cpu(unsigned int cpu) +{ + numa_set_cpumask(cpu, true); +} + +void numa_remove_cpu(unsigned int cpu) +{ + numa_set_cpumask(cpu, false); +} +#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ #endif --=20 2.32.0.3.gf3a3e56d6