From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Architectures that support NUMA duplicate the code that allocates
NODE_DATA on the node-local memory with slight variations in reporting
of the addresses where the memory was allocated.
Use x86 version as the basis for the generic alloc_node_data() function
and call this function in architecture specific numa initialization.
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
arch/loongarch/kernel/numa.c | 18 ------------------
arch/mips/loongson64/numa.c | 16 ++--------------
arch/powerpc/mm/numa.c | 24 +++---------------------
arch/sh/mm/init.c | 7 +------
arch/sparc/mm/init_64.c | 9 ++-------
arch/x86/mm/numa.c | 34 +---------------------------------
drivers/base/arch_numa.c | 21 +--------------------
include/linux/numa.h | 2 ++
mm/numa.c | 27 +++++++++++++++++++++++++++
9 files changed, 39 insertions(+), 119 deletions(-)
diff --git a/arch/loongarch/kernel/numa.c b/arch/loongarch/kernel/numa.c
index acada671e020..84fe7f854820 100644
--- a/arch/loongarch/kernel/numa.c
+++ b/arch/loongarch/kernel/numa.c
@@ -187,24 +187,6 @@ int __init numa_add_memblk(int nid, u64 start, u64 end)
return numa_add_memblk_to(nid, start, end, &numa_meminfo);
}
-static void __init alloc_node_data(int nid)
-{
- void *nd;
- unsigned long nd_pa;
- size_t nd_sz = roundup(sizeof(pg_data_t), PAGE_SIZE);
-
- nd_pa = memblock_phys_alloc_try_nid(nd_sz, SMP_CACHE_BYTES, nid);
- if (!nd_pa) {
- pr_err("Cannot find %zu Byte for node_data (initial node: %d)\n", nd_sz, nid);
- return;
- }
-
- nd = __va(nd_pa);
-
- node_data[nid] = nd;
- memset(nd, 0, sizeof(pg_data_t));
-}
-
static void __init node_mem_init(unsigned int node)
{
unsigned long start_pfn, end_pfn;
diff --git a/arch/mips/loongson64/numa.c b/arch/mips/loongson64/numa.c
index 9208eaadf690..909f6cec3a26 100644
--- a/arch/mips/loongson64/numa.c
+++ b/arch/mips/loongson64/numa.c
@@ -81,12 +81,8 @@ static void __init init_topology_matrix(void)
static void __init node_mem_init(unsigned int node)
{
- struct pglist_data *nd;
unsigned long node_addrspace_offset;
unsigned long start_pfn, end_pfn;
- unsigned long nd_pa;
- int tnid;
- const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES);
node_addrspace_offset = nid_to_addrbase(node);
pr_info("Node%d's addrspace_offset is 0x%lx\n",
@@ -96,16 +92,8 @@ static void __init node_mem_init(unsigned int node)
pr_info("Node%d: start_pfn=0x%lx, end_pfn=0x%lx\n",
node, start_pfn, end_pfn);
- nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, node);
- if (!nd_pa)
- panic("Cannot allocate %zu bytes for node %d data\n",
- nd_size, node);
- nd = __va(nd_pa);
- memset(nd, 0, sizeof(struct pglist_data));
- tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
- if (tnid != node)
- pr_info("NODE_DATA(%d) on node %d\n", node, tnid);
- node_data[node] = nd;
+ alloc_node_data(node);
+
NODE_DATA(node)->node_start_pfn = start_pfn;
NODE_DATA(node)->node_spanned_pages = end_pfn - start_pfn;
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 8c18973cd71e..4c54764af160 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1081,27 +1081,9 @@ void __init dump_numa_cpu_topology(void)
static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
{
u64 spanned_pages = end_pfn - start_pfn;
- const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES);
- u64 nd_pa;
- void *nd;
- int tnid;
-
- nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
- if (!nd_pa)
- panic("Cannot allocate %zu bytes for node %d data\n",
- nd_size, nid);
-
- nd = __va(nd_pa);
-
- /* report and initialize */
- pr_info(" NODE_DATA [mem %#010Lx-%#010Lx]\n",
- nd_pa, nd_pa + nd_size - 1);
- tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
- if (tnid != nid)
- pr_info(" NODE_DATA(%d) on node %d\n", nid, tnid);
-
- node_data[nid] = nd;
- memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
+
+ alloc_node_data(nid);
+
NODE_DATA(nid)->node_id = nid;
NODE_DATA(nid)->node_start_pfn = start_pfn;
NODE_DATA(nid)->node_spanned_pages = spanned_pages;
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index bf1b54055316..5cc89a0932c3 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -212,12 +212,7 @@ void __init allocate_pgdat(unsigned int nid)
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
#ifdef CONFIG_NUMA
- NODE_DATA(nid) = memblock_alloc_try_nid(
- sizeof(struct pglist_data),
- SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT,
- MEMBLOCK_ALLOC_ACCESSIBLE, nid);
- if (!NODE_DATA(nid))
- panic("Can't allocate pgdat for node %d\n", nid);
+ alloc_node_data(nid);
#endif
NODE_DATA(nid)->node_start_pfn = start_pfn;
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 3cb698204609..83279c43572d 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -1075,14 +1075,9 @@ static void __init allocate_node_data(int nid)
{
struct pglist_data *p;
unsigned long start_pfn, end_pfn;
-#ifdef CONFIG_NUMA
- NODE_DATA(nid) = memblock_alloc_node(sizeof(struct pglist_data),
- SMP_CACHE_BYTES, nid);
- if (!NODE_DATA(nid)) {
- prom_printf("Cannot allocate pglist_data for nid[%d]\n", nid);
- prom_halt();
- }
+#ifdef CONFIG_NUMA
+ alloc_node_data(nid);
NODE_DATA(nid)->node_id = nid;
#endif
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 7de725d6bb05..5e1dde26674b 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -191,39 +191,6 @@ int __init numa_add_memblk(int nid, u64 start, u64 end)
return numa_add_memblk_to(nid, start, end, &numa_meminfo);
}
-/* Allocate NODE_DATA for a node on the local memory */
-static void __init alloc_node_data(int nid)
-{
- const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
- u64 nd_pa;
- void *nd;
- int tnid;
-
- /*
- * Allocate node data. Try node-local memory and then any node.
- * Never allocate in DMA zone.
- */
- nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
- if (!nd_pa) {
- pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
- nd_size, nid);
- return;
- }
- nd = __va(nd_pa);
-
- /* report and initialize */
- printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
- nd_pa, nd_pa + nd_size - 1);
- tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
- if (tnid != nid)
- printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid);
-
- node_data[nid] = nd;
- memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
-
- node_set_online(nid);
-}
-
/**
* numa_cleanup_meminfo - Cleanup a numa_meminfo
* @mi: numa_meminfo to clean up
@@ -571,6 +538,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
continue;
alloc_node_data(nid);
+ node_set_online(nid);
}
/* Dump memblock with node info and return. */
diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c
index 9b71ad2869f1..2ebf12eab99f 100644
--- a/drivers/base/arch_numa.c
+++ b/drivers/base/arch_numa.c
@@ -216,30 +216,11 @@ int __init numa_add_memblk(int nid, u64 start, u64 end)
*/
static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
{
- const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES);
- u64 nd_pa;
- void *nd;
- int tnid;
-
if (start_pfn >= end_pfn)
pr_info("Initmem setup node %d [<memory-less node>]\n", nid);
- nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
- if (!nd_pa)
- panic("Cannot allocate %zu bytes for node %d data\n",
- nd_size, nid);
-
- nd = __va(nd_pa);
-
- /* report and initialize */
- pr_info("NODE_DATA [mem %#010Lx-%#010Lx]\n",
- nd_pa, nd_pa + nd_size - 1);
- tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
- if (tnid != nid)
- pr_info("NODE_DATA(%d) on node %d\n", nid, tnid);
+ alloc_node_data(nid);
- node_data[nid] = nd;
- memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
NODE_DATA(nid)->node_id = nid;
NODE_DATA(nid)->node_start_pfn = start_pfn;
NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;
diff --git a/include/linux/numa.h b/include/linux/numa.h
index e5841d4057ab..3b12d8ca0afd 100644
--- a/include/linux/numa.h
+++ b/include/linux/numa.h
@@ -33,6 +33,8 @@ static inline bool numa_valid_node(int nid)
extern struct pglist_data *node_data[];
#define NODE_DATA(nid) (node_data[nid])
+void __init alloc_node_data(int nid);
+
/* Generic implementation available */
int numa_nearest_node(int node, unsigned int state);
diff --git a/mm/numa.c b/mm/numa.c
index 8c157d41c026..0483cabc4c4b 100644
--- a/mm/numa.c
+++ b/mm/numa.c
@@ -1,11 +1,38 @@
// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/memblock.h>
#include <linux/printk.h>
#include <linux/numa.h>
struct pglist_data *node_data[MAX_NUMNODES];
EXPORT_SYMBOL(node_data);
+/* Allocate NODE_DATA for a node on the local memory */
+void __init alloc_node_data(int nid)
+{
+ const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
+ u64 nd_pa;
+ void *nd;
+ int tnid;
+
+ /* Allocate node data. Try node-local memory and then any node. */
+ nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
+ if (!nd_pa)
+ panic("Cannot allocate %zu bytes for node %d data\n",
+ nd_size, nid);
+ nd = __va(nd_pa);
+
+ /* report and initialize */
+ pr_info("NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
+ nd_pa, nd_pa + nd_size - 1);
+ tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
+ if (tnid != nid)
+ pr_info(" NODE_DATA(%d) on node %d\n", nid, tnid);
+
+ node_data[nid] = nd;
+ memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
+}
+
/* Stub functions: */
#ifndef memory_add_physaddr_to_nid
--
2.43.0
On Tue, 16 Jul 2024 14:13:34 +0300 Mike Rapoport <rppt@kernel.org> wrote:
> From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
>
> Architectures that support NUMA duplicate the code that allocates
> NODE_DATA on the node-local memory with slight variations in reporting
> of the addresses where the memory was allocated.
>
> Use x86 version as the basis for the generic alloc_node_data() function
> and call this function in architecture specific numa initialization.
>
> Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>

I've no idea what rules are for the sparc prom_printf() calls but given
that file already has mix and match of those and normal prints in single
functions I assume this change is fine and we'll just see the prints a
bit later.

Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
On 16.07.24 13:13, Mike Rapoport wrote:
> From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
>
> Architectures that support NUMA duplicate the code that allocates
> NODE_DATA on the node-local memory with slight variations in reporting
> of the addresses where the memory was allocated.
>
> Use x86 version as the basis for the generic alloc_node_data() function
> and call this function in architecture specific numa initialization.
>
> Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> ---
[...]
> diff --git a/arch/mips/loongson64/numa.c b/arch/mips/loongson64/numa.c
> index 9208eaadf690..909f6cec3a26 100644
> --- a/arch/mips/loongson64/numa.c
> +++ b/arch/mips/loongson64/numa.c
> @@ -81,12 +81,8 @@ static void __init init_topology_matrix(void)
>
> static void __init node_mem_init(unsigned int node)
> {
> - struct pglist_data *nd;
> unsigned long node_addrspace_offset;
> unsigned long start_pfn, end_pfn;
> - unsigned long nd_pa;
> - int tnid;
> - const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES);
One interesting change is that we now always round up to full pages on
architectures where we previously rounded up to SMP_CACHE_BYTES.
I assume we don't really expect a significant growth in memory
consumption that we care about, especially because most systems with
many nodes also have quite some memory around.
> -/* Allocate NODE_DATA for a node on the local memory */
> -static void __init alloc_node_data(int nid)
> -{
> - const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
> - u64 nd_pa;
> - void *nd;
> - int tnid;
> -
> - /*
> - * Allocate node data. Try node-local memory and then any node.
> - * Never allocate in DMA zone.
> - */
> - nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
> - if (!nd_pa) {
> - pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
> - nd_size, nid);
> - return;
> - }
> - nd = __va(nd_pa);
> -
> - /* report and initialize */
> - printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
> - nd_pa, nd_pa + nd_size - 1);
> - tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
> - if (tnid != nid)
> - printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid);
> -
> - node_data[nid] = nd;
> - memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
> -
> - node_set_online(nid);
> -}
> -
> /**
> * numa_cleanup_meminfo - Cleanup a numa_meminfo
> * @mi: numa_meminfo to clean up
> @@ -571,6 +538,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> continue;
>
> alloc_node_data(nid);
> + node_set_online(nid);
> }
I can spot that we only remove a single node_set_online() call from x86.
What about all the other architectures? Will there be any change in
behavior for them? Or do we simply set the nodes online later once more?
--
Cheers,
David / dhildenb
On Wed, Jul 17, 2024 at 04:42:48PM +0200, David Hildenbrand wrote:
> On 16.07.24 13:13, Mike Rapoport wrote:
> > From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
> >
> > Architectures that support NUMA duplicate the code that allocates
> > NODE_DATA on the node-local memory with slight variations in reporting
> > of the addresses where the memory was allocated.
> >
> > Use x86 version as the basis for the generic alloc_node_data() function
> > and call this function in architecture specific numa initialization.
> >
> > Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> > ---
>
> [...]
>
> > diff --git a/arch/mips/loongson64/numa.c b/arch/mips/loongson64/numa.c
> > index 9208eaadf690..909f6cec3a26 100644
> > --- a/arch/mips/loongson64/numa.c
> > +++ b/arch/mips/loongson64/numa.c
> > @@ -81,12 +81,8 @@ static void __init init_topology_matrix(void)
> > static void __init node_mem_init(unsigned int node)
> > {
> > - struct pglist_data *nd;
> > unsigned long node_addrspace_offset;
> > unsigned long start_pfn, end_pfn;
> > - unsigned long nd_pa;
> > - int tnid;
> > - const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES);
>
> One interesting change is that we now always round up to full pages on
> architectures where we previously rounded up to SMP_CACHE_BYTES.
I did some git archaeology and it seems that round up to full pages on x86
backdates to bootmem era when allocation granularity was PAGE_SIZE anyway.
I'm going to change that to SMP_CACHE_BYTES in v2.
> I assume we don't really expect a significant growth in memory consumption
> that we care about, especially because most systems with many nodes also
> have quite some memory around.
--
Sincerely yours,
Mike.
On Wed, Jul 17, 2024 at 04:42:48PM +0200, David Hildenbrand wrote:
> On 16.07.24 13:13, Mike Rapoport wrote:
> > From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
> >
> > Architectures that support NUMA duplicate the code that allocates
> > NODE_DATA on the node-local memory with slight variations in reporting
> > of the addresses where the memory was allocated.
> >
> > Use x86 version as the basis for the generic alloc_node_data() function
> > and call this function in architecture specific numa initialization.
> >
> > Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> > ---
>
> [...]
>
> > diff --git a/arch/mips/loongson64/numa.c b/arch/mips/loongson64/numa.c
> > index 9208eaadf690..909f6cec3a26 100644
> > --- a/arch/mips/loongson64/numa.c
> > +++ b/arch/mips/loongson64/numa.c
> > @@ -81,12 +81,8 @@ static void __init init_topology_matrix(void)
> > static void __init node_mem_init(unsigned int node)
> > {
> > - struct pglist_data *nd;
> > unsigned long node_addrspace_offset;
> > unsigned long start_pfn, end_pfn;
> > - unsigned long nd_pa;
> > - int tnid;
> > - const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES);
>
> One interesting change is that we now always round up to full pages on
> architectures where we previously rounded up to SMP_CACHE_BYTES.
On my workstation struct pglist_data takes 174400 bytes (cachelines: 2725, members: 43).
> I assume we don't really expect a significant growth in memory consumption
> that we care about, especially because most systems with many nodes also
> have quite some memory around.
With Debian kernel configuration for 6.5 struct pglist_data takes 174400
bytes so the increase here is below 1%.
For NUMA systems with a lot of nodes that shouldn't be a problem.
> > -/* Allocate NODE_DATA for a node on the local memory */
> > -static void __init alloc_node_data(int nid)
> > -{
> > - const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
> > - u64 nd_pa;
> > - void *nd;
> > - int tnid;
> > -
> > - /*
> > - * Allocate node data. Try node-local memory and then any node.
> > - * Never allocate in DMA zone.
> > - */
> > - nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
> > - if (!nd_pa) {
> > - pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
> > - nd_size, nid);
> > - return;
> > - }
> > - nd = __va(nd_pa);
> > -
> > - /* report and initialize */
> > - printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
> > - nd_pa, nd_pa + nd_size - 1);
> > - tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
> > - if (tnid != nid)
> > - printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid);
> > -
> > - node_data[nid] = nd;
> > - memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
> > -
> > - node_set_online(nid);
> > -}
> > -
> > /**
> > * numa_cleanup_meminfo - Cleanup a numa_meminfo
> > * @mi: numa_meminfo to clean up
> > @@ -571,6 +538,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> > continue;
> > alloc_node_data(nid);
> > + node_set_online(nid);
> > }
>
> I can spot that we only remove a single node_set_online() call from x86.
>
> What about all the other architectures? Will there be any change in behavior
> for them? Or do we simply set the nodes online later once more?
On x86 node_set_online() was a part of alloc_node_data() and I moved it
outside so it's called right after alloc_node_data(). On other
architectures the allocation didn't include that call, so there should be
no difference there.
> --
> Cheers,
>
> David / dhildenb
>
>
--
Sincerely yours,
Mike.
>>> - * Allocate node data. Try node-local memory and then any node.
>>> - * Never allocate in DMA zone.
>>> - */
>>> - nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
>>> - if (!nd_pa) {
>>> - pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
>>> - nd_size, nid);
>>> - return;
>>> - }
>>> - nd = __va(nd_pa);
>>> -
>>> - /* report and initialize */
>>> - printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
>>> - nd_pa, nd_pa + nd_size - 1);
>>> - tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
>>> - if (tnid != nid)
>>> - printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid);
>>> -
>>> - node_data[nid] = nd;
>>> - memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
>>> -
>>> - node_set_online(nid);
>>> -}
>>> -
>>> /**
>>> * numa_cleanup_meminfo - Cleanup a numa_meminfo
>>> * @mi: numa_meminfo to clean up
>>> @@ -571,6 +538,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
>>> continue;
>>> alloc_node_data(nid);
>>> + node_set_online(nid);
>>> }
>>
>> I can spot that we only remove a single node_set_online() call from x86.
>>
>> What about all the other architectures? Will there be any change in behavior
>> for them? Or do we simply set the nodes online later once more?
>
> On x86 node_set_online() was a part of alloc_node_data() and I moved it
> outside so it's called right after alloc_node_data(). On other
> architectures the allocation didn't include that call, so there should be
> no difference there.
But won't their arch code try setting the nodes online at a later stage?
And I think, some architectures only set nodes online conditionally
(see most other node_set_online() calls).
Sorry if I'm confused here, but with now unconditional node_set_online(), won't
we change the behavior of other architectures?
--
Cheers,
David / dhildenb
On Fri, 19 Jul 2024 17:07:35 +0200
David Hildenbrand <david@redhat.com> wrote:
> >>> - * Allocate node data. Try node-local memory and then any node.
> >>> - * Never allocate in DMA zone.
> >>> - */
> >>> - nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
> >>> - if (!nd_pa) {
> >>> - pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
> >>> - nd_size, nid);
> >>> - return;
> >>> - }
> >>> - nd = __va(nd_pa);
> >>> -
> >>> - /* report and initialize */
> >>> - printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
> >>> - nd_pa, nd_pa + nd_size - 1);
> >>> - tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
> >>> - if (tnid != nid)
> >>> - printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid);
> >>> -
> >>> - node_data[nid] = nd;
> >>> - memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
> >>> -
> >>> - node_set_online(nid);
> >>> -}
> >>> -
> >>> /**
> >>> * numa_cleanup_meminfo - Cleanup a numa_meminfo
> >>> * @mi: numa_meminfo to clean up
> >>> @@ -571,6 +538,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> >>> continue;
> >>> alloc_node_data(nid);
> >>> + node_set_online(nid);
> >>> }
> >>
> >> I can spot that we only remove a single node_set_online() call from x86.
> >>
> >> What about all the other architectures? Will there be any change in behavior
> >> for them? Or do we simply set the nodes online later once more?
> >
> > On x86 node_set_online() was a part of alloc_node_data() and I moved it
> > outside so it's called right after alloc_node_data(). On other
> > architectures the allocation didn't include that call, so there should be
> > no difference there.
>
> But won't their arch code try setting the nodes online at a later stage?
>
> And I think, some architectures only set nodes online conditionally
> (see most other node_set_online() calls).
>
> Sorry if I'm confused here, but with now unconditional node_set_online(), won't
> we change the behavior of other architectures?
This is moving x86 code to x86 code, not a generic location
so how would that affect anyone else? Their onlining should be same as
before.
The node onlining difference are a pain (I recall that fun from adding
generic initiators) as different ordering on x86 and arm64 at least.
Jonathan
>
On 19.07.24 17:51, Jonathan Cameron wrote:
> On Fri, 19 Jul 2024 17:07:35 +0200
> David Hildenbrand <david@redhat.com> wrote:
>
>>>>> - * Allocate node data. Try node-local memory and then any node.
>>>>> - * Never allocate in DMA zone.
>>>>> - */
>>>>> - nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
>>>>> - if (!nd_pa) {
>>>>> - pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
>>>>> - nd_size, nid);
>>>>> - return;
>>>>> - }
>>>>> - nd = __va(nd_pa);
>>>>> -
>>>>> - /* report and initialize */
>>>>> - printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
>>>>> - nd_pa, nd_pa + nd_size - 1);
>>>>> - tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
>>>>> - if (tnid != nid)
>>>>> - printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid);
>>>>> -
>>>>> - node_data[nid] = nd;
>>>>> - memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
>>>>> -
>>>>> - node_set_online(nid);
>>>>> -}
>>>>> -
>>>>> /**
>>>>> * numa_cleanup_meminfo - Cleanup a numa_meminfo
>>>>> * @mi: numa_meminfo to clean up
>>>>> @@ -571,6 +538,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
>>>>> continue;
>>>>> alloc_node_data(nid);
>>>>> + node_set_online(nid);
>>>>> }
>>>>
>>>> I can spot that we only remove a single node_set_online() call from x86.
>>>>
>>>> What about all the other architectures? Will there be any change in behavior
>>>> for them? Or do we simply set the nodes online later once more?
>>>
>>> On x86 node_set_online() was a part of alloc_node_data() and I moved it
>>> outside so it's called right after alloc_node_data(). On other
>>> architectures the allocation didn't include that call, so there should be
>>> no difference there.
>>
>> But won't their arch code try setting the nodes online at a later stage?
>>
>> And I think, some architectures only set nodes online conditionally
>> (see most other node_set_online() calls).
>>
>> Sorry if I'm confused here, but with now unconditional node_set_online(), won't
>> we change the behavior of other architectures?
> This is moving x86 code to x86 code, not a generic location
> so how would that affect anyone else? Their onlining should be same as
> before.
Yes, see my reply to Mike.
>
> The node onlining difference are a pain (I recall that fun from adding
> generic initiators) as different ordering on x86 and arm64 at least.
That's part of the reason I was confused, because I remember some nasty
inconsistency.
--
Cheers,
David / dhildenb
On Fri, Jul 19, 2024 at 05:07:35PM +0200, David Hildenbrand wrote:
> > > > - * Allocate node data. Try node-local memory and then any node.
> > > > - * Never allocate in DMA zone.
> > > > - */
> > > > - nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
> > > > - if (!nd_pa) {
> > > > - pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
> > > > - nd_size, nid);
> > > > - return;
> > > > - }
> > > > - nd = __va(nd_pa);
> > > > -
> > > > - /* report and initialize */
> > > > - printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
> > > > - nd_pa, nd_pa + nd_size - 1);
> > > > - tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
> > > > - if (tnid != nid)
> > > > - printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid);
> > > > -
> > > > - node_data[nid] = nd;
> > > > - memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
> > > > -
> > > > - node_set_online(nid);
> > > > -}
> > > > -
> > > > /**
> > > > * numa_cleanup_meminfo - Cleanup a numa_meminfo
> > > > * @mi: numa_meminfo to clean up
> > > > @@ -571,6 +538,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> > > > continue;
> > > > alloc_node_data(nid);
> > > > + node_set_online(nid);
> > > > }
> > >
> > > I can spot that we only remove a single node_set_online() call from x86.
> > >
> > > What about all the other architectures? Will there be any change in behavior
> > > for them? Or do we simply set the nodes online later once more?
> >
> > On x86 node_set_online() was a part of alloc_node_data() and I moved it
> > outside so it's called right after alloc_node_data(). On other
> > architectures the allocation didn't include that call, so there should be
> > no difference there.
>
> But won't their arch code try setting the nodes online at a later stage?
>
> And I think, some architectures only set nodes online conditionally
> (see most other node_set_online() calls).
>
> Sorry if I'm confused here, but with now unconditional node_set_online(), won't
> we change the behavior of other architectures?
The generic alloc_node_data() does not set the node online:
+/* Allocate NODE_DATA for a node on the local memory */
+void __init alloc_node_data(int nid)
+{
+ const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
+ u64 nd_pa;
+ void *nd;
+ int tnid;
+
+ /* Allocate node data. Try node-local memory and then any node. */
+ nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
+ if (!nd_pa)
+ panic("Cannot allocate %zu bytes for node %d data\n",
+ nd_size, nid);
+ nd = __va(nd_pa);
+
+ /* report and initialize */
+ pr_info("NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
+ nd_pa, nd_pa + nd_size - 1);
+ tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
+ if (tnid != nid)
+ pr_info(" NODE_DATA(%d) on node %d\n", nid, tnid);
+
+ node_data[nid] = nd;
+ memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
+}
I might have missed some architecture except x86 that calls
node_set_online() in its alloc_node_data(), but the intention was to leave
that call outside the alloc and explicitly add it after the call to
alloc_node_data() if needed like in x86.
> --
> Cheers,
>
> David / dhildenb
>
>
--
Sincerely yours,
Mike.
On 19.07.24 17:34, Mike Rapoport wrote:
> On Fri, Jul 19, 2024 at 05:07:35PM +0200, David Hildenbrand wrote:
>>>>> - * Allocate node data. Try node-local memory and then any node.
>>>>> - * Never allocate in DMA zone.
>>>>> - */
>>>>> - nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
>>>>> - if (!nd_pa) {
>>>>> - pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
>>>>> - nd_size, nid);
>>>>> - return;
>>>>> - }
>>>>> - nd = __va(nd_pa);
>>>>> -
>>>>> - /* report and initialize */
>>>>> - printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
>>>>> - nd_pa, nd_pa + nd_size - 1);
>>>>> - tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
>>>>> - if (tnid != nid)
>>>>> - printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid);
>>>>> -
>>>>> - node_data[nid] = nd;
>>>>> - memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
>>>>> -
>>>>> - node_set_online(nid);
>>>>> -}
>>>>> -
>>>>> /**
>>>>> * numa_cleanup_meminfo - Cleanup a numa_meminfo
>>>>> * @mi: numa_meminfo to clean up
>>>>> @@ -571,6 +538,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
>>>>> continue;
>>>>> alloc_node_data(nid);
>>>>> + node_set_online(nid);
>>>>> }
>>>>
>>>> I can spot that we only remove a single node_set_online() call from x86.
>>>>
>>>> What about all the other architectures? Will there be any change in behavior
>>>> for them? Or do we simply set the nodes online later once more?
>>>
>>> On x86 node_set_online() was a part of alloc_node_data() and I moved it
>>> outside so it's called right after alloc_node_data(). On other
>>> architectures the allocation didn't include that call, so there should be
>>> no difference there.
>>
>> But won't their arch code try setting the nodes online at a later stage?
>>
>> And I think, some architectures only set nodes online conditionally
>> (see most other node_set_online() calls).
>>
>> Sorry if I'm confused here, but with now unconditional node_set_online(), won't
>> we change the behavior of other architectures?
>
> The generic alloc_node_data() does not set the node online:
>
> +/* Allocate NODE_DATA for a node on the local memory */
> +void __init alloc_node_data(int nid)
> +{
> + const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
> + u64 nd_pa;
> + void *nd;
> + int tnid;
> +
> + /* Allocate node data. Try node-local memory and then any node. */
> + nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
> + if (!nd_pa)
> + panic("Cannot allocate %zu bytes for node %d data\n",
> + nd_size, nid);
> + nd = __va(nd_pa);
> +
> + /* report and initialize */
> + pr_info("NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
> + nd_pa, nd_pa + nd_size - 1);
> + tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
> + if (tnid != nid)
> + pr_info(" NODE_DATA(%d) on node %d\n", nid, tnid);
> +
> + node_data[nid] = nd;
> + memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
> +}
>
> I might have missed some architecture except x86 that calls
> node_set_online() in its alloc_node_data(), but the intention was to leave
> that call outside the alloc and explicitly add it after the call to
> alloc_node_data() if needed like in x86.
I'm stupid, I didn't realize it is still only called from x86 :(
Acked-by: David Hildenbrand <david@redhat.com>
--
Cheers,
David / dhildenb
© 2016 - 2025 Red Hat, Inc.