[XEN RFC PATCH 12/40] xen/x86: Move numa_initmem_init to common

Wei Chen posted 40 patches 4 years, 5 months ago
[XEN RFC PATCH 12/40] xen/x86: Move numa_initmem_init to common
Posted by Wei Chen 4 years, 5 months ago
This function can be reused by Arm device tree based
NUMA support. So we move it from x86 to common, as well
as its related variables and functions:
setup_node_bootmem, numa_init_array and numa_emulation.

As numa_initmem_init has been moved to common, _memnodemap
is not used cross files. We can restore _memnodemap to
static.

Signed-off-by: Wei Chen <wei.chen@arm.com>
---
 xen/arch/x86/numa.c         | 118 ----------------------------------
 xen/common/numa.c           | 122 +++++++++++++++++++++++++++++++++++-
 xen/include/asm-x86/numa.h  |   5 --
 xen/include/asm-x86/setup.h |   1 -
 xen/include/xen/numa.h      |   8 ++-
 5 files changed, 128 insertions(+), 126 deletions(-)

diff --git a/xen/arch/x86/numa.c b/xen/arch/x86/numa.c
index f2626b3968..6908738305 100644
--- a/xen/arch/x86/numa.c
+++ b/xen/arch/x86/numa.c
@@ -38,7 +38,6 @@ nodeid_t apicid_to_node[MAX_LOCAL_APIC] = {
 
 nodemask_t __read_mostly node_online_map = { { [0] = 1UL } };
 
-bool numa_off;
 s8 acpi_numa = 0;
 
 int srat_disabled(void)
@@ -46,123 +45,6 @@ int srat_disabled(void)
     return numa_off || acpi_numa < 0;
 }
 
-/* initialize NODE_DATA given nodeid and start/end */
-void __init setup_node_bootmem(nodeid_t nodeid, u64 start, u64 end)
-{ 
-    unsigned long start_pfn, end_pfn;
-
-    start_pfn = start >> PAGE_SHIFT;
-    end_pfn = end >> PAGE_SHIFT;
-
-    NODE_DATA(nodeid)->node_start_pfn = start_pfn;
-    NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
-
-    node_set_online(nodeid);
-} 
-
-void __init numa_init_array(void)
-{
-    int rr, i;
-
-    /* There are unfortunately some poorly designed mainboards around
-       that only connect memory to a single CPU. This breaks the 1:1 cpu->node
-       mapping. To avoid this fill in the mapping for all possible
-       CPUs, as the number of CPUs is not known yet.
-       We round robin the existing nodes. */
-    rr = first_node(node_online_map);
-    for ( i = 0; i < nr_cpu_ids; i++ )
-    {
-        if ( cpu_to_node[i] != NUMA_NO_NODE )
-            continue;
-        numa_set_node(i, rr);
-        rr = cycle_node(rr, node_online_map);
-    }
-}
-
-#ifdef CONFIG_NUMA_EMU
-static int numa_fake __initdata = 0;
-
-/* Numa emulation */
-static int __init numa_emulation(u64 start_pfn, u64 end_pfn)
-{
-    int i;
-    struct node nodes[MAX_NUMNODES];
-    u64 sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;
-
-    /* Kludge needed for the hash function */
-    if ( hweight64(sz) > 1 )
-    {
-        u64 x = 1;
-        while ( (x << 1) < sz )
-            x <<= 1;
-        if ( x < sz/2 )
-            printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
-        sz = x;
-    }
-
-    memset(&nodes,0,sizeof(nodes));
-    for ( i = 0; i < numa_fake; i++ )
-    {
-        nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
-        if ( i == numa_fake - 1 )
-            sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
-        nodes[i].end = nodes[i].start + sz;
-        printk(KERN_INFO "Faking node %d at %"PRIx64"-%"PRIx64" (%"PRIu64"MB)\n",
-               i,
-               nodes[i].start, nodes[i].end,
-               (nodes[i].end - nodes[i].start) >> 20);
-        node_set_online(i);
-    }
-    memnode_shift = compute_hash_shift(nodes, numa_fake, NULL);
-    if ( memnode_shift < 0 )
-    {
-        memnode_shift = 0;
-        printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
-        return -1;
-    }
-    for_each_online_node ( i )
-        setup_node_bootmem(i, nodes[i].start, nodes[i].end);
-    numa_init_array();
-
-    return 0;
-}
-#endif
-
-void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
-{ 
-    int i;
-
-#ifdef CONFIG_NUMA_EMU
-    if ( numa_fake && !numa_emulation(start_pfn, end_pfn) )
-        return;
-#endif
-
-#ifdef CONFIG_ACPI_NUMA
-    if ( !numa_off && !acpi_scan_nodes((u64)start_pfn << PAGE_SHIFT,
-         (u64)end_pfn << PAGE_SHIFT) )
-        return;
-#endif
-
-    printk(KERN_INFO "%s\n",
-           numa_off ? "NUMA turned off" : "No NUMA configuration found");
-
-    printk(KERN_INFO "Faking a node at %016"PRIx64"-%016"PRIx64"\n",
-           (u64)start_pfn << PAGE_SHIFT,
-           (u64)end_pfn << PAGE_SHIFT);
-    /* setup dummy node covering all memory */
-    memnode_shift = BITS_PER_LONG - 1;
-    memnodemap = _memnodemap;
-    memnodemapsize = ARRAY_SIZE(_memnodemap);
-
-    nodes_clear(node_online_map);
-    node_set_online(0);
-    for ( i = 0; i < nr_cpu_ids; i++ )
-        numa_set_node(i, 0);
-    cpumask_copy(&node_to_cpumask[0], cpumask_of(0));
-    setup_node_bootmem(0, (u64)start_pfn << PAGE_SHIFT,
-                    (u64)end_pfn << PAGE_SHIFT);
-}
-
 void numa_set_node(int cpu, nodeid_t node)
 {
     cpu_to_node[cpu] = node;
diff --git a/xen/common/numa.c b/xen/common/numa.c
index 1facc8fe2b..26c0006d04 100644
--- a/xen/common/numa.c
+++ b/xen/common/numa.c
@@ -14,12 +14,13 @@
 #include <xen/smp.h>
 #include <xen/pfn.h>
 #include <xen/sched.h>
+#include <asm/acpi.h>
 
 struct node_data node_data[MAX_NUMNODES];
 
 /* Mapping from pdx to node id */
 int memnode_shift;
-typeof(*memnodemap) _memnodemap[64];
+static typeof(*memnodemap) _memnodemap[64];
 unsigned long memnodemapsize;
 u8 *memnodemap;
 
@@ -34,6 +35,8 @@ int num_node_memblks;
 struct node node_memblk_range[NR_NODE_MEMBLKS];
 nodeid_t memblk_nodeid[NR_NODE_MEMBLKS];
 
+bool numa_off;
+
 /*
  * Given a shift value, try to populate memnodemap[]
  * Returns :
@@ -191,3 +194,120 @@ void numa_add_cpu(int cpu)
 {
     cpumask_set_cpu(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
 }
+
+/* initialize NODE_DATA given nodeid and start/end */
+void __init setup_node_bootmem(nodeid_t nodeid, u64 start, u64 end)
+{
+    unsigned long start_pfn, end_pfn;
+
+    start_pfn = start >> PAGE_SHIFT;
+    end_pfn = end >> PAGE_SHIFT;
+
+    NODE_DATA(nodeid)->node_start_pfn = start_pfn;
+    NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
+
+    node_set_online(nodeid);
+}
+
+void __init numa_init_array(void)
+{
+    int rr, i;
+
+    /* There are unfortunately some poorly designed mainboards around
+       that only connect memory to a single CPU. This breaks the 1:1 cpu->node
+       mapping. To avoid this fill in the mapping for all possible
+       CPUs, as the number of CPUs is not known yet.
+       We round robin the existing nodes. */
+    rr = first_node(node_online_map);
+    for ( i = 0; i < nr_cpu_ids; i++ )
+    {
+        if ( cpu_to_node[i] != NUMA_NO_NODE )
+            continue;
+        numa_set_node(i, rr);
+        rr = cycle_node(rr, node_online_map);
+    }
+}
+
+#ifdef CONFIG_NUMA_EMU
+int numa_fake __initdata = 0;
+
+/* Numa emulation */
+static int __init numa_emulation(u64 start_pfn, u64 end_pfn)
+{
+    int i;
+    struct node nodes[MAX_NUMNODES];
+    u64 sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;
+
+    /* Kludge needed for the hash function */
+    if ( hweight64(sz) > 1 )
+    {
+        u64 x = 1;
+        while ( (x << 1) < sz )
+            x <<= 1;
+        if ( x < sz/2 )
+            printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
+        sz = x;
+    }
+
+    memset(&nodes,0,sizeof(nodes));
+    for ( i = 0; i < numa_fake; i++ )
+    {
+        nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
+        if ( i == numa_fake - 1 )
+            sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
+        nodes[i].end = nodes[i].start + sz;
+        printk(KERN_INFO "Faking node %d at %"PRIx64"-%"PRIx64" (%"PRIu64"MB)\n",
+               i,
+               nodes[i].start, nodes[i].end,
+               (nodes[i].end - nodes[i].start) >> 20);
+        node_set_online(i);
+    }
+    memnode_shift = compute_hash_shift(nodes, numa_fake, NULL);
+    if ( memnode_shift < 0 )
+    {
+        memnode_shift = 0;
+        printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
+        return -1;
+    }
+    for_each_online_node ( i )
+        setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+    numa_init_array();
+
+    return 0;
+}
+#endif
+
+void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
+{
+    int i;
+
+#ifdef CONFIG_NUMA_EMU
+    if ( numa_fake && !numa_emulation(start_pfn, end_pfn) )
+        return;
+#endif
+
+#ifdef CONFIG_ACPI_NUMA
+    if ( !numa_off && !acpi_scan_nodes((u64)start_pfn << PAGE_SHIFT,
+         (u64)end_pfn << PAGE_SHIFT) )
+        return;
+#endif
+
+    printk(KERN_INFO "%s\n",
+           numa_off ? "NUMA turned off" : "No NUMA configuration found");
+
+    printk(KERN_INFO "Faking a node at %016"PRIx64"-%016"PRIx64"\n",
+           (u64)start_pfn << PAGE_SHIFT,
+           (u64)end_pfn << PAGE_SHIFT);
+    /* setup dummy node covering all memory */
+    memnode_shift = BITS_PER_LONG - 1;
+    memnodemap = _memnodemap;
+    memnodemapsize = ARRAY_SIZE(_memnodemap);
+
+    nodes_clear(node_online_map);
+    node_set_online(0);
+    for ( i = 0; i < nr_cpu_ids; i++ )
+        numa_set_node(i, 0);
+    cpumask_copy(&node_to_cpumask[0], cpumask_of(0));
+    setup_node_bootmem(0, (u64)start_pfn << PAGE_SHIFT,
+                    (u64)end_pfn << PAGE_SHIFT);
+}
diff --git a/xen/include/asm-x86/numa.h b/xen/include/asm-x86/numa.h
index e8a92ad9df..f8e4e15586 100644
--- a/xen/include/asm-x86/numa.h
+++ b/xen/include/asm-x86/numa.h
@@ -21,16 +21,11 @@ extern nodeid_t pxm_to_node(unsigned int pxm);
 
 #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
 
-extern void numa_init_array(void);
-extern bool numa_off;
-
-
 extern int srat_disabled(void);
 extern void numa_set_node(int cpu, nodeid_t node);
 extern nodeid_t setup_node(unsigned int pxm);
 extern void srat_detect_node(int cpu);
 
-extern void setup_node_bootmem(nodeid_t nodeid, u64 start, u64 end);
 extern nodeid_t apicid_to_node[];
 extern void init_cpu_to_node(void);
 
diff --git a/xen/include/asm-x86/setup.h b/xen/include/asm-x86/setup.h
index 24be46115d..63838ba2d1 100644
--- a/xen/include/asm-x86/setup.h
+++ b/xen/include/asm-x86/setup.h
@@ -17,7 +17,6 @@ void early_time_init(void);
 
 void set_nr_cpu_ids(unsigned int max_cpus);
 
-void numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn);
 void arch_init_memory(void);
 void subarch_init_memory(void);
 
diff --git a/xen/include/xen/numa.h b/xen/include/xen/numa.h
index 67b79a73a3..258a5cb3db 100644
--- a/xen/include/xen/numa.h
+++ b/xen/include/xen/numa.h
@@ -26,7 +26,6 @@
 extern int memnode_shift;
 extern unsigned long memnodemapsize;
 extern u8 *memnodemap;
-extern typeof(*memnodemap) _memnodemap[64];
 
 struct node_data {
     unsigned long node_start_pfn;
@@ -69,6 +68,13 @@ extern int conflicting_memblks(u64 start, u64 end);
 extern void cutoff_node(int i, u64 start, u64 end);
 extern int valid_numa_range(u64 start, u64 end, nodeid_t node);
 
+extern void numa_init_array(void);
+extern void numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn);
+extern bool numa_off;
+extern int numa_fake;
+
+extern void setup_node_bootmem(nodeid_t nodeid, u64 start, u64 end);
+
 #endif /* CONFIG_NUMA */
 
 #endif /* _XEN_NUMA_H */
-- 
2.25.1


Re: [XEN RFC PATCH 12/40] xen/x86: Move numa_initmem_init to common
Posted by Julien Grall 4 years, 5 months ago
Hi Wei,

On 11/08/2021 11:23, Wei Chen wrote:
> This function can be reused by Arm device tree based
> NUMA support. So we move it from x86 to common, as well
> as its related variables and functions:
> setup_node_bootmem, numa_init_array and numa_emulation.
> 
> As numa_initmem_init has been moved to common, _memnodemap
> is not used cross files. We can restore _memnodemap to
> static.

As we discussed on a previous patch, we should try to avoid this kind of 
dance. I can help to find a split that would achieve that.

> 
> Signed-off-by: Wei Chen <wei.chen@arm.com>
> ---
>   xen/arch/x86/numa.c         | 118 ----------------------------------
>   xen/common/numa.c           | 122 +++++++++++++++++++++++++++++++++++-
>   xen/include/asm-x86/numa.h  |   5 --
>   xen/include/asm-x86/setup.h |   1 -
>   xen/include/xen/numa.h      |   8 ++-
>   5 files changed, 128 insertions(+), 126 deletions(-)
> 
> diff --git a/xen/arch/x86/numa.c b/xen/arch/x86/numa.c
> index f2626b3968..6908738305 100644
> --- a/xen/arch/x86/numa.c
> +++ b/xen/arch/x86/numa.c
> @@ -38,7 +38,6 @@ nodeid_t apicid_to_node[MAX_LOCAL_APIC] = {
>   
>   nodemask_t __read_mostly node_online_map = { { [0] = 1UL } };
>   
> -bool numa_off;
>   s8 acpi_numa = 0;
>   
>   int srat_disabled(void)
> @@ -46,123 +45,6 @@ int srat_disabled(void)
>       return numa_off || acpi_numa < 0;
>   }
>   
> -/* initialize NODE_DATA given nodeid and start/end */
> -void __init setup_node_bootmem(nodeid_t nodeid, u64 start, u64 end)
> -{
> -    unsigned long start_pfn, end_pfn;
> -
> -    start_pfn = start >> PAGE_SHIFT;
> -    end_pfn = end >> PAGE_SHIFT;
> -
> -    NODE_DATA(nodeid)->node_start_pfn = start_pfn;
> -    NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
> -
> -    node_set_online(nodeid);
> -}
> -
> -void __init numa_init_array(void)
> -{
> -    int rr, i;
> -
> -    /* There are unfortunately some poorly designed mainboards around
> -       that only connect memory to a single CPU. This breaks the 1:1 cpu->node
> -       mapping. To avoid this fill in the mapping for all possible
> -       CPUs, as the number of CPUs is not known yet.
> -       We round robin the existing nodes. */
> -    rr = first_node(node_online_map);
> -    for ( i = 0; i < nr_cpu_ids; i++ )
> -    {
> -        if ( cpu_to_node[i] != NUMA_NO_NODE )
> -            continue;
> -        numa_set_node(i, rr);
> -        rr = cycle_node(rr, node_online_map);
> -    }
> -}
> -
> -#ifdef CONFIG_NUMA_EMU
> -static int numa_fake __initdata = 0;
> -
> -/* Numa emulation */
> -static int __init numa_emulation(u64 start_pfn, u64 end_pfn)
> -{
> -    int i;
> -    struct node nodes[MAX_NUMNODES];
> -    u64 sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;
> -
> -    /* Kludge needed for the hash function */
> -    if ( hweight64(sz) > 1 )
> -    {
> -        u64 x = 1;
> -        while ( (x << 1) < sz )
> -            x <<= 1;
> -        if ( x < sz/2 )
> -            printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
> -        sz = x;
> -    }
> -
> -    memset(&nodes,0,sizeof(nodes));
> -    for ( i = 0; i < numa_fake; i++ )
> -    {
> -        nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
> -        if ( i == numa_fake - 1 )
> -            sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
> -        nodes[i].end = nodes[i].start + sz;
> -        printk(KERN_INFO "Faking node %d at %"PRIx64"-%"PRIx64" (%"PRIu64"MB)\n",
> -               i,
> -               nodes[i].start, nodes[i].end,
> -               (nodes[i].end - nodes[i].start) >> 20);
> -        node_set_online(i);
> -    }
> -    memnode_shift = compute_hash_shift(nodes, numa_fake, NULL);
> -    if ( memnode_shift < 0 )
> -    {
> -        memnode_shift = 0;
> -        printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
> -        return -1;
> -    }
> -    for_each_online_node ( i )
> -        setup_node_bootmem(i, nodes[i].start, nodes[i].end);
> -    numa_init_array();
> -
> -    return 0;
> -}
> -#endif
> -
> -void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
> -{
> -    int i;
> -
> -#ifdef CONFIG_NUMA_EMU
> -    if ( numa_fake && !numa_emulation(start_pfn, end_pfn) )
> -        return;
> -#endif
> -
> -#ifdef CONFIG_ACPI_NUMA
> -    if ( !numa_off && !acpi_scan_nodes((u64)start_pfn << PAGE_SHIFT,
> -         (u64)end_pfn << PAGE_SHIFT) )
> -        return;
> -#endif
> -
> -    printk(KERN_INFO "%s\n",
> -           numa_off ? "NUMA turned off" : "No NUMA configuration found");
> -
> -    printk(KERN_INFO "Faking a node at %016"PRIx64"-%016"PRIx64"\n",
> -           (u64)start_pfn << PAGE_SHIFT,
> -           (u64)end_pfn << PAGE_SHIFT);
> -    /* setup dummy node covering all memory */
> -    memnode_shift = BITS_PER_LONG - 1;
> -    memnodemap = _memnodemap;
> -    memnodemapsize = ARRAY_SIZE(_memnodemap);
> -
> -    nodes_clear(node_online_map);
> -    node_set_online(0);
> -    for ( i = 0; i < nr_cpu_ids; i++ )
> -        numa_set_node(i, 0);
> -    cpumask_copy(&node_to_cpumask[0], cpumask_of(0));
> -    setup_node_bootmem(0, (u64)start_pfn << PAGE_SHIFT,
> -                    (u64)end_pfn << PAGE_SHIFT);
> -}
> -
>   void numa_set_node(int cpu, nodeid_t node)
>   {
>       cpu_to_node[cpu] = node;
> diff --git a/xen/common/numa.c b/xen/common/numa.c
> index 1facc8fe2b..26c0006d04 100644
> --- a/xen/common/numa.c
> +++ b/xen/common/numa.c
> @@ -14,12 +14,13 @@
>   #include <xen/smp.h>
>   #include <xen/pfn.h>
>   #include <xen/sched.h>

NIT: We tend to add a newline betwen <xen/...> headers and <asm/...> 
headers.

> +#include <asm/acpi.h>
>   
>   struct node_data node_data[MAX_NUMNODES];
>   
>   /* Mapping from pdx to node id */
>   int memnode_shift;
> -typeof(*memnodemap) _memnodemap[64];
> +static typeof(*memnodemap) _memnodemap[64];
>   unsigned long memnodemapsize;
>   u8 *memnodemap;
>   
> @@ -34,6 +35,8 @@ int num_node_memblks;
>   struct node node_memblk_range[NR_NODE_MEMBLKS];
>   nodeid_t memblk_nodeid[NR_NODE_MEMBLKS];
>   
> +bool numa_off;
> +
>   /*
>    * Given a shift value, try to populate memnodemap[]
>    * Returns :
> @@ -191,3 +194,120 @@ void numa_add_cpu(int cpu)
>   {
>       cpumask_set_cpu(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
>   }
> +
> +/* initialize NODE_DATA given nodeid and start/end */
> +void __init setup_node_bootmem(nodeid_t nodeid, u64 start, u64 end)

 From an abstract PoV, start and end should be paddr_t. This should be 
done on a separate patch though.

> +{
> +    unsigned long start_pfn, end_pfn;
> +
> +    start_pfn = start >> PAGE_SHIFT;
> +    end_pfn = end >> PAGE_SHIFT;
> +
> +    NODE_DATA(nodeid)->node_start_pfn = start_pfn;
> +    NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
> +
> +    node_set_online(nodeid);
> +}
> +
> +void __init numa_init_array(void)
> +{
> +    int rr, i;
> +
> +    /* There are unfortunately some poorly designed mainboards around
> +       that only connect memory to a single CPU. This breaks the 1:1 cpu->node
> +       mapping. To avoid this fill in the mapping for all possible
> +       CPUs, as the number of CPUs is not known yet.
> +       We round robin the existing nodes. */
> +    rr = first_node(node_online_map);
> +    for ( i = 0; i < nr_cpu_ids; i++ )
> +    {
> +        if ( cpu_to_node[i] != NUMA_NO_NODE )
> +            continue;
> +        numa_set_node(i, rr);
> +        rr = cycle_node(rr, node_online_map);
> +    }
> +}
> +
> +#ifdef CONFIG_NUMA_EMU
> +int numa_fake __initdata = 0;
> +
> +/* Numa emulation */
> +static int __init numa_emulation(u64 start_pfn, u64 end_pfn)

Here, this should be either "unsigned long" or ideally "mfn_t". 
Although, if you use "unsigned long", you will need to...

> +{
> +    int i;
> +    struct node nodes[MAX_NUMNODES];
> +    u64 sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;

... cast "(end_pfn - start_pfn)" to uin64_t or use pfn_to_paddr().

> +
> +    /* Kludge needed for the hash function */
> +    if ( hweight64(sz) > 1 )
> +    {
> +        u64 x = 1;
> +        while ( (x << 1) < sz )
> +            x <<= 1;
> +        if ( x < sz/2 )
> +            printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
> +        sz = x;
> +    }
> +
> +    memset(&nodes,0,sizeof(nodes));
> +    for ( i = 0; i < numa_fake; i++ )
> +    {
> +        nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
> +        if ( i == numa_fake - 1 )
> +            sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
> +        nodes[i].end = nodes[i].start + sz;
> +        printk(KERN_INFO "Faking node %d at %"PRIx64"-%"PRIx64" (%"PRIu64"MB)\n",
> +               i,
> +               nodes[i].start, nodes[i].end,
> +               (nodes[i].end - nodes[i].start) >> 20);
> +        node_set_online(i);
> +    }
> +    memnode_shift = compute_hash_shift(nodes, numa_fake, NULL);
> +    if ( memnode_shift < 0 )
> +    {
> +        memnode_shift = 0;
> +        printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
> +        return -1;
> +    }
> +    for_each_online_node ( i )
> +        setup_node_bootmem(i, nodes[i].start, nodes[i].end);
> +    numa_init_array();
> +
> +    return 0;
> +}
> +#endif
> +
> +void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
> +{
> +    int i;
> +
> +#ifdef CONFIG_NUMA_EMU
> +    if ( numa_fake && !numa_emulation(start_pfn, end_pfn) )
> +        return;
> +#endif
> +
> +#ifdef CONFIG_ACPI_NUMA
> +    if ( !numa_off && !acpi_scan_nodes((u64)start_pfn << PAGE_SHIFT,
> +         (u64)end_pfn << PAGE_SHIFT) )

(u64)v << PAGE_SHIFT should be switched to use pfn_to_paddr() or 
mfn_to_paddr() if you decide to make start_pfn and end_pfn typesafe.

> +        return;
> +#endif
> +
> +    printk(KERN_INFO "%s\n",
> +           numa_off ? "NUMA turned off" : "No NUMA configuration found");
> +
> +    printk(KERN_INFO "Faking a node at %016"PRIx64"-%016"PRIx64"\n",
> +           (u64)start_pfn << PAGE_SHIFT,
> +           (u64)end_pfn << PAGE_SHIFT);

Same remark here. PRIx64 would also have to be switched to PRIpaddr.

> +    /* setup dummy node covering all memory */
> +    memnode_shift = BITS_PER_LONG - 1;
> +    memnodemap = _memnodemap;
> +    memnodemapsize = ARRAY_SIZE(_memnodemap);
> +
> +    nodes_clear(node_online_map);
> +    node_set_online(0);
> +    for ( i = 0; i < nr_cpu_ids; i++ )
> +        numa_set_node(i, 0);
> +    cpumask_copy(&node_to_cpumask[0], cpumask_of(0));
> +    setup_node_bootmem(0, (u64)start_pfn << PAGE_SHIFT,
> +                    (u64)end_pfn << PAGE_SHIFT);
> +}
> diff --git a/xen/include/asm-x86/numa.h b/xen/include/asm-x86/numa.h
> index e8a92ad9df..f8e4e15586 100644
> --- a/xen/include/asm-x86/numa.h
> +++ b/xen/include/asm-x86/numa.h
> @@ -21,16 +21,11 @@ extern nodeid_t pxm_to_node(unsigned int pxm);
>   
>   #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
>   
> -extern void numa_init_array(void);
> -extern bool numa_off;
> -
> -
>   extern int srat_disabled(void);
>   extern void numa_set_node(int cpu, nodeid_t node);
>   extern nodeid_t setup_node(unsigned int pxm);
>   extern void srat_detect_node(int cpu);
>   
> -extern void setup_node_bootmem(nodeid_t nodeid, u64 start, u64 end);
>   extern nodeid_t apicid_to_node[];
>   extern void init_cpu_to_node(void);
>   
> diff --git a/xen/include/asm-x86/setup.h b/xen/include/asm-x86/setup.h
> index 24be46115d..63838ba2d1 100644
> --- a/xen/include/asm-x86/setup.h
> +++ b/xen/include/asm-x86/setup.h
> @@ -17,7 +17,6 @@ void early_time_init(void);
>   
>   void set_nr_cpu_ids(unsigned int max_cpus);
>   
> -void numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn);
>   void arch_init_memory(void);
>   void subarch_init_memory(void);
>   
> diff --git a/xen/include/xen/numa.h b/xen/include/xen/numa.h
> index 67b79a73a3..258a5cb3db 100644
> --- a/xen/include/xen/numa.h
> +++ b/xen/include/xen/numa.h
> @@ -26,7 +26,6 @@
>   extern int memnode_shift;
>   extern unsigned long memnodemapsize;
>   extern u8 *memnodemap;
> -extern typeof(*memnodemap) _memnodemap[64];
>   
>   struct node_data {
>       unsigned long node_start_pfn;
> @@ -69,6 +68,13 @@ extern int conflicting_memblks(u64 start, u64 end);
>   extern void cutoff_node(int i, u64 start, u64 end);
>   extern int valid_numa_range(u64 start, u64 end, nodeid_t node);
>   
> +extern void numa_init_array(void);
> +extern void numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn);
> +extern bool numa_off;
> +extern int numa_fake;
> +
> +extern void setup_node_bootmem(nodeid_t nodeid, u64 start, u64 end);
> +
>   #endif /* CONFIG_NUMA */
>   
>   #endif /* _XEN_NUMA_H */
> 

Cheers,

-- 
Julien Grall

RE: [XEN RFC PATCH 12/40] xen/x86: Move numa_initmem_init to common
Posted by Wei Chen 4 years, 5 months ago
Hi Julien,

> -----Original Message-----
> From: Julien Grall <julien@xen.org>
> Sent: 2021年8月25日 18:22
> To: Wei Chen <Wei.Chen@arm.com>; xen-devel@lists.xenproject.org;
> sstabellini@kernel.org; jbeulich@suse.com
> Cc: Bertrand Marquis <Bertrand.Marquis@arm.com>
> Subject: Re: [XEN RFC PATCH 12/40] xen/x86: Move numa_initmem_init to
> common
> 
> Hi Wei,
> 
> On 11/08/2021 11:23, Wei Chen wrote:
> > This function can be reused by Arm device tree based
> > NUMA support. So we move it from x86 to common, as well
> > as its related variables and functions:
> > setup_node_bootmem, numa_init_array and numa_emulation.
> >
> > As numa_initmem_init has been moved to common, _memnodemap
> > is not used cross files. We can restore _memnodemap to
> > static.
> 
> As we discussed on a previous patch, we should try to avoid this kind of
> dance. I can help to find a split that would achieve that.
> 

Yes, thanks!

> >
> > Signed-off-by: Wei Chen <wei.chen@arm.com>
> > ---
> >   xen/arch/x86/numa.c         | 118 ----------------------------------
> >   xen/common/numa.c           | 122 +++++++++++++++++++++++++++++++++++-
> >   xen/include/asm-x86/numa.h  |   5 --
> >   xen/include/asm-x86/setup.h |   1 -
> >   xen/include/xen/numa.h      |   8 ++-
> >   5 files changed, 128 insertions(+), 126 deletions(-)
> >
> > diff --git a/xen/arch/x86/numa.c b/xen/arch/x86/numa.c
> > index f2626b3968..6908738305 100644
> > --- a/xen/arch/x86/numa.c
> > +++ b/xen/arch/x86/numa.c
> > @@ -38,7 +38,6 @@ nodeid_t apicid_to_node[MAX_LOCAL_APIC] = {
> >
> >   nodemask_t __read_mostly node_online_map = { { [0] = 1UL } };
> >
> > -bool numa_off;
> >   s8 acpi_numa = 0;
> >
> >   int srat_disabled(void)
> > @@ -46,123 +45,6 @@ int srat_disabled(void)
> >       return numa_off || acpi_numa < 0;
> >   }
> >
> > -/* initialize NODE_DATA given nodeid and start/end */
> > -void __init setup_node_bootmem(nodeid_t nodeid, u64 start, u64 end)
> > -{
> > -    unsigned long start_pfn, end_pfn;
> > -
> > -    start_pfn = start >> PAGE_SHIFT;
> > -    end_pfn = end >> PAGE_SHIFT;
> > -
> > -    NODE_DATA(nodeid)->node_start_pfn = start_pfn;
> > -    NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
> > -
> > -    node_set_online(nodeid);
> > -}
> > -
> > -void __init numa_init_array(void)
> > -{
> > -    int rr, i;
> > -
> > -    /* There are unfortunately some poorly designed mainboards around
> > -       that only connect memory to a single CPU. This breaks the 1:1
> cpu->node
> > -       mapping. To avoid this fill in the mapping for all possible
> > -       CPUs, as the number of CPUs is not known yet.
> > -       We round robin the existing nodes. */
> > -    rr = first_node(node_online_map);
> > -    for ( i = 0; i < nr_cpu_ids; i++ )
> > -    {
> > -        if ( cpu_to_node[i] != NUMA_NO_NODE )
> > -            continue;
> > -        numa_set_node(i, rr);
> > -        rr = cycle_node(rr, node_online_map);
> > -    }
> > -}
> > -
> > -#ifdef CONFIG_NUMA_EMU
> > -static int numa_fake __initdata = 0;
> > -
> > -/* Numa emulation */
> > -static int __init numa_emulation(u64 start_pfn, u64 end_pfn)
> > -{
> > -    int i;
> > -    struct node nodes[MAX_NUMNODES];
> > -    u64 sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;
> > -
> > -    /* Kludge needed for the hash function */
> > -    if ( hweight64(sz) > 1 )
> > -    {
> > -        u64 x = 1;
> > -        while ( (x << 1) < sz )
> > -            x <<= 1;
> > -        if ( x < sz/2 )
> > -            printk(KERN_ERR "Numa emulation unbalanced. Complain to
> maintainer\n");
> > -        sz = x;
> > -    }
> > -
> > -    memset(&nodes,0,sizeof(nodes));
> > -    for ( i = 0; i < numa_fake; i++ )
> > -    {
> > -        nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
> > -        if ( i == numa_fake - 1 )
> > -            sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
> > -        nodes[i].end = nodes[i].start + sz;
> > -        printk(KERN_INFO "Faking node %d at %"PRIx64"-%"PRIx64"
> (%"PRIu64"MB)\n",
> > -               i,
> > -               nodes[i].start, nodes[i].end,
> > -               (nodes[i].end - nodes[i].start) >> 20);
> > -        node_set_online(i);
> > -    }
> > -    memnode_shift = compute_hash_shift(nodes, numa_fake, NULL);
> > -    if ( memnode_shift < 0 )
> > -    {
> > -        memnode_shift = 0;
> > -        printk(KERN_ERR "No NUMA hash function found. Emulation
> disabled.\n");
> > -        return -1;
> > -    }
> > -    for_each_online_node ( i )
> > -        setup_node_bootmem(i, nodes[i].start, nodes[i].end);
> > -    numa_init_array();
> > -
> > -    return 0;
> > -}
> > -#endif
> > -
> > -void __init numa_initmem_init(unsigned long start_pfn, unsigned long
> end_pfn)
> > -{
> > -    int i;
> > -
> > -#ifdef CONFIG_NUMA_EMU
> > -    if ( numa_fake && !numa_emulation(start_pfn, end_pfn) )
> > -        return;
> > -#endif
> > -
> > -#ifdef CONFIG_ACPI_NUMA
> > -    if ( !numa_off && !acpi_scan_nodes((u64)start_pfn << PAGE_SHIFT,
> > -         (u64)end_pfn << PAGE_SHIFT) )
> > -        return;
> > -#endif
> > -
> > -    printk(KERN_INFO "%s\n",
> > -           numa_off ? "NUMA turned off" : "No NUMA configuration
> found");
> > -
> > -    printk(KERN_INFO "Faking a node at %016"PRIx64"-%016"PRIx64"\n",
> > -           (u64)start_pfn << PAGE_SHIFT,
> > -           (u64)end_pfn << PAGE_SHIFT);
> > -    /* setup dummy node covering all memory */
> > -    memnode_shift = BITS_PER_LONG - 1;
> > -    memnodemap = _memnodemap;
> > -    memnodemapsize = ARRAY_SIZE(_memnodemap);
> > -
> > -    nodes_clear(node_online_map);
> > -    node_set_online(0);
> > -    for ( i = 0; i < nr_cpu_ids; i++ )
> > -        numa_set_node(i, 0);
> > -    cpumask_copy(&node_to_cpumask[0], cpumask_of(0));
> > -    setup_node_bootmem(0, (u64)start_pfn << PAGE_SHIFT,
> > -                    (u64)end_pfn << PAGE_SHIFT);
> > -}
> > -
> >   void numa_set_node(int cpu, nodeid_t node)
> >   {
> >       cpu_to_node[cpu] = node;
> > diff --git a/xen/common/numa.c b/xen/common/numa.c
> > index 1facc8fe2b..26c0006d04 100644
> > --- a/xen/common/numa.c
> > +++ b/xen/common/numa.c
> > @@ -14,12 +14,13 @@
> >   #include <xen/smp.h>
> >   #include <xen/pfn.h>
> >   #include <xen/sched.h>
> 
> NIT: We tend to add a newline betwen <xen/...> headers and <asm/...>
> headers.
> 

got it

> > +#include <asm/acpi.h>
> >
> >   struct node_data node_data[MAX_NUMNODES];
> >
> >   /* Mapping from pdx to node id */
> >   int memnode_shift;
> > -typeof(*memnodemap) _memnodemap[64];
> > +static typeof(*memnodemap) _memnodemap[64];
> >   unsigned long memnodemapsize;
> >   u8 *memnodemap;
> >
> > @@ -34,6 +35,8 @@ int num_node_memblks;
> >   struct node node_memblk_range[NR_NODE_MEMBLKS];
> >   nodeid_t memblk_nodeid[NR_NODE_MEMBLKS];
> >
> > +bool numa_off;
> > +
> >   /*
> >    * Given a shift value, try to populate memnodemap[]
> >    * Returns :
> > @@ -191,3 +194,120 @@ void numa_add_cpu(int cpu)
> >   {
> >       cpumask_set_cpu(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
> >   }
> > +
> > +/* initialize NODE_DATA given nodeid and start/end */
> > +void __init setup_node_bootmem(nodeid_t nodeid, u64 start, u64 end)
> 
>  From an abstract PoV, start and end should be paddr_t. This should be
> done on a separate patch though.
> 

Ok.

> > +{
> > +    unsigned long start_pfn, end_pfn;
> > +
> > +    start_pfn = start >> PAGE_SHIFT;
> > +    end_pfn = end >> PAGE_SHIFT;
> > +
> > +    NODE_DATA(nodeid)->node_start_pfn = start_pfn;
> > +    NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
> > +
> > +    node_set_online(nodeid);
> > +}
> > +
> > +void __init numa_init_array(void)
> > +{
> > +    int rr, i;
> > +
> > +    /* There are unfortunately some poorly designed mainboards around
> > +       that only connect memory to a single CPU. This breaks the 1:1
> cpu->node
> > +       mapping. To avoid this fill in the mapping for all possible
> > +       CPUs, as the number of CPUs is not known yet.
> > +       We round robin the existing nodes. */
> > +    rr = first_node(node_online_map);
> > +    for ( i = 0; i < nr_cpu_ids; i++ )
> > +    {
> > +        if ( cpu_to_node[i] != NUMA_NO_NODE )
> > +            continue;
> > +        numa_set_node(i, rr);
> > +        rr = cycle_node(rr, node_online_map);
> > +    }
> > +}
> > +
> > +#ifdef CONFIG_NUMA_EMU
> > +int numa_fake __initdata = 0;
> > +
> > +/* Numa emulation */
> > +static int __init numa_emulation(u64 start_pfn, u64 end_pfn)
> 
> Here, this should be either "unsigned long" or ideally "mfn_t".
> Although, if you use "unsigned long", you will need to...
> 

Do we need a separate patch to do it?

> > +{
> > +    int i;
> > +    struct node nodes[MAX_NUMNODES];
> > +    u64 sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;
> 
> ... cast "(end_pfn - start_pfn)" to uin64_t or use pfn_to_paddr().
> 

Ok

> > +
> > +    /* Kludge needed for the hash function */
> > +    if ( hweight64(sz) > 1 )
> > +    {
> > +        u64 x = 1;
> > +        while ( (x << 1) < sz )
> > +            x <<= 1;
> > +        if ( x < sz/2 )
> > +            printk(KERN_ERR "Numa emulation unbalanced. Complain to
> maintainer\n");
> > +        sz = x;
> > +    }
> > +
> > +    memset(&nodes,0,sizeof(nodes));
> > +    for ( i = 0; i < numa_fake; i++ )
> > +    {
> > +        nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
> > +        if ( i == numa_fake - 1 )
> > +            sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
> > +        nodes[i].end = nodes[i].start + sz;
> > +        printk(KERN_INFO "Faking node %d at %"PRIx64"-%"PRIx64"
> (%"PRIu64"MB)\n",
> > +               i,
> > +               nodes[i].start, nodes[i].end,
> > +               (nodes[i].end - nodes[i].start) >> 20);
> > +        node_set_online(i);
> > +    }
> > +    memnode_shift = compute_hash_shift(nodes, numa_fake, NULL);
> > +    if ( memnode_shift < 0 )
> > +    {
> > +        memnode_shift = 0;
> > +        printk(KERN_ERR "No NUMA hash function found. Emulation
> disabled.\n");
> > +        return -1;
> > +    }
> > +    for_each_online_node ( i )
> > +        setup_node_bootmem(i, nodes[i].start, nodes[i].end);
> > +    numa_init_array();
> > +
> > +    return 0;
> > +}
> > +#endif
> > +
> > +void __init numa_initmem_init(unsigned long start_pfn, unsigned long
> end_pfn)
> > +{
> > +    int i;
> > +
> > +#ifdef CONFIG_NUMA_EMU
> > +    if ( numa_fake && !numa_emulation(start_pfn, end_pfn) )
> > +        return;
> > +#endif
> > +
> > +#ifdef CONFIG_ACPI_NUMA
> > +    if ( !numa_off && !acpi_scan_nodes((u64)start_pfn << PAGE_SHIFT,
> > +         (u64)end_pfn << PAGE_SHIFT) )
> 
> (u64)v << PAGE_SHIFT should be switched to use pfn_to_paddr() or
> mfn_to_paddr() if you decide to make start_pfn and end_pfn typesafe.
> 

Still need a separate patch to change it before move?

> > +        return;
> > +#endif
> > +
> > +    printk(KERN_INFO "%s\n",
> > +           numa_off ? "NUMA turned off" : "No NUMA configuration
> found");
> > +
> > +    printk(KERN_INFO "Faking a node at %016"PRIx64"-%016"PRIx64"\n",
> > +           (u64)start_pfn << PAGE_SHIFT,
> > +           (u64)end_pfn << PAGE_SHIFT);
> 
> Same remark here. PRIx64 would also have to be switched to PRIpaddr.
> 

Hmm, It seems I'd better to use a separate patch to do PRIpaddr clean up
before move code.

> > +    /* setup dummy node covering all memory */
> > +    memnode_shift = BITS_PER_LONG - 1;
> > +    memnodemap = _memnodemap;
> > +    memnodemapsize = ARRAY_SIZE(_memnodemap);
> > +
> > +    nodes_clear(node_online_map);
> > +    node_set_online(0);
> > +    for ( i = 0; i < nr_cpu_ids; i++ )
> > +        numa_set_node(i, 0);
> > +    cpumask_copy(&node_to_cpumask[0], cpumask_of(0));
> > +    setup_node_bootmem(0, (u64)start_pfn << PAGE_SHIFT,
> > +                    (u64)end_pfn << PAGE_SHIFT);
> > +}
> > diff --git a/xen/include/asm-x86/numa.h b/xen/include/asm-x86/numa.h
> > index e8a92ad9df..f8e4e15586 100644
> > --- a/xen/include/asm-x86/numa.h
> > +++ b/xen/include/asm-x86/numa.h
> > @@ -21,16 +21,11 @@ extern nodeid_t pxm_to_node(unsigned int pxm);
> >
> >   #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
> >
> > -extern void numa_init_array(void);
> > -extern bool numa_off;
> > -
> > -
> >   extern int srat_disabled(void);
> >   extern void numa_set_node(int cpu, nodeid_t node);
> >   extern nodeid_t setup_node(unsigned int pxm);
> >   extern void srat_detect_node(int cpu);
> >
> > -extern void setup_node_bootmem(nodeid_t nodeid, u64 start, u64 end);
> >   extern nodeid_t apicid_to_node[];
> >   extern void init_cpu_to_node(void);
> >
> > diff --git a/xen/include/asm-x86/setup.h b/xen/include/asm-x86/setup.h
> > index 24be46115d..63838ba2d1 100644
> > --- a/xen/include/asm-x86/setup.h
> > +++ b/xen/include/asm-x86/setup.h
> > @@ -17,7 +17,6 @@ void early_time_init(void);
> >
> >   void set_nr_cpu_ids(unsigned int max_cpus);
> >
> > -void numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn);
> >   void arch_init_memory(void);
> >   void subarch_init_memory(void);
> >
> > diff --git a/xen/include/xen/numa.h b/xen/include/xen/numa.h
> > index 67b79a73a3..258a5cb3db 100644
> > --- a/xen/include/xen/numa.h
> > +++ b/xen/include/xen/numa.h
> > @@ -26,7 +26,6 @@
> >   extern int memnode_shift;
> >   extern unsigned long memnodemapsize;
> >   extern u8 *memnodemap;
> > -extern typeof(*memnodemap) _memnodemap[64];
> >
> >   struct node_data {
> >       unsigned long node_start_pfn;
> > @@ -69,6 +68,13 @@ extern int conflicting_memblks(u64 start, u64 end);
> >   extern void cutoff_node(int i, u64 start, u64 end);
> >   extern int valid_numa_range(u64 start, u64 end, nodeid_t node);
> >
> > +extern void numa_init_array(void);
> > +extern void numa_initmem_init(unsigned long start_pfn, unsigned long
> end_pfn);
> > +extern bool numa_off;
> > +extern int numa_fake;
> > +
> > +extern void setup_node_bootmem(nodeid_t nodeid, u64 start, u64 end);
> > +
> >   #endif /* CONFIG_NUMA */
> >
> >   #endif /* _XEN_NUMA_H */
> >
> 
> Cheers,
> 
> --
> Julien Grall
Re: [XEN RFC PATCH 12/40] xen/x86: Move numa_initmem_init to common
Posted by Julien Grall 4 years, 5 months ago

On 25/08/2021 12:15, Wei Chen wrote:
> Hi Julien,

Hi Wei,

>> -----Original Message-----
>> From: Julien Grall <julien@xen.org>
>> Sent: 2021年8月25日 18:22
>> To: Wei Chen <Wei.Chen@arm.com>; xen-devel@lists.xenproject.org;
>> sstabellini@kernel.org; jbeulich@suse.com
>> Cc: Bertrand Marquis <Bertrand.Marquis@arm.com>
>> Subject: Re: [XEN RFC PATCH 12/40] xen/x86: Move numa_initmem_init to
>> common
>>
>> Hi Wei,
>>
>> On 11/08/2021 11:23, Wei Chen wrote:
>>> This function can be reused by Arm device tree based
>>> NUMA support. So we move it from x86 to common, as well
>>> as its related variables and functions:
>>> setup_node_bootmem, numa_init_array and numa_emulation.
>>>
>>> As numa_initmem_init has been moved to common, _memnodemap
>>> is not used cross files. We can restore _memnodemap to
>>> static.
>>
>> As we discussed on a previous patch, we should try to avoid this kind of
>> dance. I can help to find a split that would achieve that.
>>
> 
> Yes, thanks!
> 
>>>
>>> Signed-off-by: Wei Chen <wei.chen@arm.com>
>>> ---
>>>    xen/arch/x86/numa.c         | 118 ----------------------------------
>>>    xen/common/numa.c           | 122 +++++++++++++++++++++++++++++++++++-
>>>    xen/include/asm-x86/numa.h  |   5 --
>>>    xen/include/asm-x86/setup.h |   1 -
>>>    xen/include/xen/numa.h      |   8 ++-
>>>    5 files changed, 128 insertions(+), 126 deletions(-)
>>>
>>> diff --git a/xen/arch/x86/numa.c b/xen/arch/x86/numa.c
>>> index f2626b3968..6908738305 100644
>>> --- a/xen/arch/x86/numa.c
>>> +++ b/xen/arch/x86/numa.c
>>> @@ -38,7 +38,6 @@ nodeid_t apicid_to_node[MAX_LOCAL_APIC] = {
>>>
>>>    nodemask_t __read_mostly node_online_map = { { [0] = 1UL } };
>>>
>>> -bool numa_off;
>>>    s8 acpi_numa = 0;
>>>
>>>    int srat_disabled(void)
>>> @@ -46,123 +45,6 @@ int srat_disabled(void)
>>>        return numa_off || acpi_numa < 0;
>>>    }
>>>
>>> -/* initialize NODE_DATA given nodeid and start/end */
>>> -void __init setup_node_bootmem(nodeid_t nodeid, u64 start, u64 end)
>>> -{
>>> -    unsigned long start_pfn, end_pfn;
>>> -
>>> -    start_pfn = start >> PAGE_SHIFT;
>>> -    end_pfn = end >> PAGE_SHIFT;
>>> -
>>> -    NODE_DATA(nodeid)->node_start_pfn = start_pfn;
>>> -    NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
>>> -
>>> -    node_set_online(nodeid);
>>> -}
>>> -
>>> -void __init numa_init_array(void)
>>> -{
>>> -    int rr, i;
>>> -
>>> -    /* There are unfortunately some poorly designed mainboards around
>>> -       that only connect memory to a single CPU. This breaks the 1:1
>> cpu->node
>>> -       mapping. To avoid this fill in the mapping for all possible
>>> -       CPUs, as the number of CPUs is not known yet.
>>> -       We round robin the existing nodes. */
>>> -    rr = first_node(node_online_map);
>>> -    for ( i = 0; i < nr_cpu_ids; i++ )
>>> -    {
>>> -        if ( cpu_to_node[i] != NUMA_NO_NODE )
>>> -            continue;
>>> -        numa_set_node(i, rr);
>>> -        rr = cycle_node(rr, node_online_map);
>>> -    }
>>> -}
>>> -
>>> -#ifdef CONFIG_NUMA_EMU
>>> -static int numa_fake __initdata = 0;
>>> -
>>> -/* Numa emulation */
>>> -static int __init numa_emulation(u64 start_pfn, u64 end_pfn)
>>> -{
>>> -    int i;
>>> -    struct node nodes[MAX_NUMNODES];
>>> -    u64 sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;
>>> -
>>> -    /* Kludge needed for the hash function */
>>> -    if ( hweight64(sz) > 1 )
>>> -    {
>>> -        u64 x = 1;
>>> -        while ( (x << 1) < sz )
>>> -            x <<= 1;
>>> -        if ( x < sz/2 )
>>> -            printk(KERN_ERR "Numa emulation unbalanced. Complain to
>> maintainer\n");
>>> -        sz = x;
>>> -    }
>>> -
>>> -    memset(&nodes,0,sizeof(nodes));
>>> -    for ( i = 0; i < numa_fake; i++ )
>>> -    {
>>> -        nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
>>> -        if ( i == numa_fake - 1 )
>>> -            sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
>>> -        nodes[i].end = nodes[i].start + sz;
>>> -        printk(KERN_INFO "Faking node %d at %"PRIx64"-%"PRIx64"
>> (%"PRIu64"MB)\n",
>>> -               i,
>>> -               nodes[i].start, nodes[i].end,
>>> -               (nodes[i].end - nodes[i].start) >> 20);
>>> -        node_set_online(i);
>>> -    }
>>> -    memnode_shift = compute_hash_shift(nodes, numa_fake, NULL);
>>> -    if ( memnode_shift < 0 )
>>> -    {
>>> -        memnode_shift = 0;
>>> -        printk(KERN_ERR "No NUMA hash function found. Emulation
>> disabled.\n");
>>> -        return -1;
>>> -    }
>>> -    for_each_online_node ( i )
>>> -        setup_node_bootmem(i, nodes[i].start, nodes[i].end);
>>> -    numa_init_array();
>>> -
>>> -    return 0;
>>> -}
>>> -#endif
>>> -
>>> -void __init numa_initmem_init(unsigned long start_pfn, unsigned long
>> end_pfn)
>>> -{
>>> -    int i;
>>> -
>>> -#ifdef CONFIG_NUMA_EMU
>>> -    if ( numa_fake && !numa_emulation(start_pfn, end_pfn) )
>>> -        return;
>>> -#endif
>>> -
>>> -#ifdef CONFIG_ACPI_NUMA
>>> -    if ( !numa_off && !acpi_scan_nodes((u64)start_pfn << PAGE_SHIFT,
>>> -         (u64)end_pfn << PAGE_SHIFT) )
>>> -        return;
>>> -#endif
>>> -
>>> -    printk(KERN_INFO "%s\n",
>>> -           numa_off ? "NUMA turned off" : "No NUMA configuration
>> found");
>>> -
>>> -    printk(KERN_INFO "Faking a node at %016"PRIx64"-%016"PRIx64"\n",
>>> -           (u64)start_pfn << PAGE_SHIFT,
>>> -           (u64)end_pfn << PAGE_SHIFT);
>>> -    /* setup dummy node covering all memory */
>>> -    memnode_shift = BITS_PER_LONG - 1;
>>> -    memnodemap = _memnodemap;
>>> -    memnodemapsize = ARRAY_SIZE(_memnodemap);
>>> -
>>> -    nodes_clear(node_online_map);
>>> -    node_set_online(0);
>>> -    for ( i = 0; i < nr_cpu_ids; i++ )
>>> -        numa_set_node(i, 0);
>>> -    cpumask_copy(&node_to_cpumask[0], cpumask_of(0));
>>> -    setup_node_bootmem(0, (u64)start_pfn << PAGE_SHIFT,
>>> -                    (u64)end_pfn << PAGE_SHIFT);
>>> -}
>>> -
>>>    void numa_set_node(int cpu, nodeid_t node)
>>>    {
>>>        cpu_to_node[cpu] = node;
>>> diff --git a/xen/common/numa.c b/xen/common/numa.c
>>> index 1facc8fe2b..26c0006d04 100644
>>> --- a/xen/common/numa.c
>>> +++ b/xen/common/numa.c
>>> @@ -14,12 +14,13 @@
>>>    #include <xen/smp.h>
>>>    #include <xen/pfn.h>
>>>    #include <xen/sched.h>
>>
>> NIT: We tend to add a newline betwen <xen/...> headers and <asm/...>
>> headers.
>>
> 
> got it
> 
>>> +#include <asm/acpi.h>
>>>
>>>    struct node_data node_data[MAX_NUMNODES];
>>>
>>>    /* Mapping from pdx to node id */
>>>    int memnode_shift;
>>> -typeof(*memnodemap) _memnodemap[64];
>>> +static typeof(*memnodemap) _memnodemap[64];
>>>    unsigned long memnodemapsize;
>>>    u8 *memnodemap;
>>>
>>> @@ -34,6 +35,8 @@ int num_node_memblks;
>>>    struct node node_memblk_range[NR_NODE_MEMBLKS];
>>>    nodeid_t memblk_nodeid[NR_NODE_MEMBLKS];
>>>
>>> +bool numa_off;
>>> +
>>>    /*
>>>     * Given a shift value, try to populate memnodemap[]
>>>     * Returns :
>>> @@ -191,3 +194,120 @@ void numa_add_cpu(int cpu)
>>>    {
>>>        cpumask_set_cpu(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
>>>    }
>>> +
>>> +/* initialize NODE_DATA given nodeid and start/end */
>>> +void __init setup_node_bootmem(nodeid_t nodeid, u64 start, u64 end)
>>
>>   From an abstract PoV, start and end should be paddr_t. This should be
>> done on a separate patch though.
>>
> 
> Ok.
> 
>>> +{
>>> +    unsigned long start_pfn, end_pfn;
>>> +
>>> +    start_pfn = start >> PAGE_SHIFT;
>>> +    end_pfn = end >> PAGE_SHIFT;
>>> +
>>> +    NODE_DATA(nodeid)->node_start_pfn = start_pfn;
>>> +    NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
>>> +
>>> +    node_set_online(nodeid);
>>> +}
>>> +
>>> +void __init numa_init_array(void)
>>> +{
>>> +    int rr, i;
>>> +
>>> +    /* There are unfortunately some poorly designed mainboards around
>>> +       that only connect memory to a single CPU. This breaks the 1:1
>> cpu->node
>>> +       mapping. To avoid this fill in the mapping for all possible
>>> +       CPUs, as the number of CPUs is not known yet.
>>> +       We round robin the existing nodes. */
>>> +    rr = first_node(node_online_map);
>>> +    for ( i = 0; i < nr_cpu_ids; i++ )
>>> +    {
>>> +        if ( cpu_to_node[i] != NUMA_NO_NODE )
>>> +            continue;
>>> +        numa_set_node(i, rr);
>>> +        rr = cycle_node(rr, node_online_map);
>>> +    }
>>> +}
>>> +
>>> +#ifdef CONFIG_NUMA_EMU
>>> +int numa_fake __initdata = 0;
>>> +
>>> +/* Numa emulation */
>>> +static int __init numa_emulation(u64 start_pfn, u64 end_pfn)
>>
>> Here, this should be either "unsigned long" or ideally "mfn_t".
>> Although, if you use "unsigned long", you will need to...
>>
> 
> Do we need a separate patch to do it?

I would prefer if the cleanups are done separately as this makes easier 
to review code movement.

> 
>>> +{
>>> +    int i;
>>> +    struct node nodes[MAX_NUMNODES];
>>> +    u64 sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;
>>
>> ... cast "(end_pfn - start_pfn)" to uin64_t or use pfn_to_paddr().
>>
> 
> Ok
> 
>>> +
>>> +    /* Kludge needed for the hash function */
>>> +    if ( hweight64(sz) > 1 )
>>> +    {
>>> +        u64 x = 1;
>>> +        while ( (x << 1) < sz )
>>> +            x <<= 1;
>>> +        if ( x < sz/2 )
>>> +            printk(KERN_ERR "Numa emulation unbalanced. Complain to
>> maintainer\n");
>>> +        sz = x;
>>> +    }
>>> +
>>> +    memset(&nodes,0,sizeof(nodes));
>>> +    for ( i = 0; i < numa_fake; i++ )
>>> +    {
>>> +        nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
>>> +        if ( i == numa_fake - 1 )
>>> +            sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
>>> +        nodes[i].end = nodes[i].start + sz;
>>> +        printk(KERN_INFO "Faking node %d at %"PRIx64"-%"PRIx64"
>> (%"PRIu64"MB)\n",
>>> +               i,
>>> +               nodes[i].start, nodes[i].end,
>>> +               (nodes[i].end - nodes[i].start) >> 20);
>>> +        node_set_online(i);
>>> +    }
>>> +    memnode_shift = compute_hash_shift(nodes, numa_fake, NULL);
>>> +    if ( memnode_shift < 0 )
>>> +    {
>>> +        memnode_shift = 0;
>>> +        printk(KERN_ERR "No NUMA hash function found. Emulation
>> disabled.\n");
>>> +        return -1;
>>> +    }
>>> +    for_each_online_node ( i )
>>> +        setup_node_bootmem(i, nodes[i].start, nodes[i].end);
>>> +    numa_init_array();
>>> +
>>> +    return 0;
>>> +}
>>> +#endif
>>> +
>>> +void __init numa_initmem_init(unsigned long start_pfn, unsigned long
>> end_pfn)
>>> +{
>>> +    int i;
>>> +
>>> +#ifdef CONFIG_NUMA_EMU
>>> +    if ( numa_fake && !numa_emulation(start_pfn, end_pfn) )
>>> +        return;
>>> +#endif
>>> +
>>> +#ifdef CONFIG_ACPI_NUMA
>>> +    if ( !numa_off && !acpi_scan_nodes((u64)start_pfn << PAGE_SHIFT,
>>> +         (u64)end_pfn << PAGE_SHIFT) )
>>
>> (u64)v << PAGE_SHIFT should be switched to use pfn_to_paddr() or
>> mfn_to_paddr() if you decide to make start_pfn and end_pfn typesafe.
>>
> 
> Still need a separate patch to change it before move?

Yes.

Cheers,

-- 
Julien Grall