[RFC][PATCH 2/6] x86/topo: Add TOPO_NUMA_DOMAIN

Peter Zijlstra posted 6 patches 1 month, 1 week ago
There is a newer version of this series
[RFC][PATCH 2/6] x86/topo: Add TOPO_NUMA_DOMAIN
Posted by Peter Zijlstra 1 month, 1 week ago
Use the SRAT data to add an extra NUMA domain. Since the SLIT table is
a matrix, the SRAT proximity domain 'must' be a dense set and will not
exceed MAX_LOCAL_APIC.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/include/asm/topology.h |    3 +++
 arch/x86/kernel/cpu/topology.c  |    9 +++++++++
 2 files changed, 12 insertions(+)

--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -111,6 +111,9 @@ enum x86_topology_domains {
 	TOPO_DIE_DOMAIN,
 	TOPO_DIEGRP_DOMAIN,
 	TOPO_PKG_DOMAIN,
+#ifdef CONFIG_NUMA
+	TOPO_NUMA_DOMAIN,
+#endif
 	TOPO_MAX_DOMAIN,
 };
 
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -31,6 +31,7 @@
 #include <asm/mpspec.h>
 #include <asm/msr.h>
 #include <asm/smp.h>
+#include <asm/numa.h>
 
 #include "cpu.h"
 
@@ -88,6 +89,14 @@ static inline u32 topo_apicid(u32 apicid
 {
 	if (dom == TOPO_SMT_DOMAIN)
 		return apicid;
+#ifdef CONFIG_NUMA
+	if (dom == TOPO_NUMA_DOMAIN) {
+		int nid = __apicid_to_phys_node[apicid];
+		if (nid == NUMA_NO_NODE)
+			nid = 0;
+		return nid;
+	}
+#endif
 	return apicid & (UINT_MAX << x86_topo_system.dom_shifts[dom - 1]);
 }
Re: [RFC][PATCH 2/6] x86/topo: Add TOPO_NUMA_DOMAIN
Posted by K Prateek Nayak 1 month, 1 week ago
Hello Peter,

On 2/26/2026 4:19 PM, Peter Zijlstra wrote:
> @@ -88,6 +89,14 @@ static inline u32 topo_apicid(u32 apicid
>  {
>  	if (dom == TOPO_SMT_DOMAIN)
>  		return apicid;
> +#ifdef CONFIG_NUMA
> +	if (dom == TOPO_NUMA_DOMAIN) {
> +		int nid = __apicid_to_phys_node[apicid];
> +		if (nid == NUMA_NO_NODE)
> +			nid = 0;
> +		return nid;
> +	}
> +#endif

I'm not digging this override - simply because topo_apicid() was not
meant to handle these kinds of cases where we cannot derive a topology
ID by simply shifting and masking the APICID.

Looking at the series, all we need is an equivalent of:

  domain_weight(TOPO_NUMA_DOMAIN)

so can we do something like the following on top of the changes in this
series:

  (!CONFIG_NUMA has only been build tested)

diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index 5d6da2ad84e5..05461e2cd931 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -53,6 +53,8 @@ extern void __init init_cpu_to_node(void);
 extern void numa_add_cpu(unsigned int cpu);
 extern void numa_remove_cpu(unsigned int cpu);
 extern void init_gi_nodes(void);
+extern void __init topo_register_apic_phys_node(int apicid);
+extern int num_phys_nodes(void);
 #else	/* CONFIG_NUMA */
 static inline void numa_set_node(int cpu, int node)	{ }
 static inline void numa_clear_node(int cpu)		{ }
@@ -60,6 +62,11 @@ static inline void init_cpu_to_node(void)		{ }
 static inline void numa_add_cpu(unsigned int cpu)	{ }
 static inline void numa_remove_cpu(unsigned int cpu)	{ }
 static inline void init_gi_nodes(void)			{ }
+static inline void __init topo_register_apic_phys_node(int apicid) { }
+static inline int num_phys_nodes(void)
+{
+	return 1;
+}
 #endif	/* CONFIG_NUMA */
 
 #ifdef CONFIG_DEBUG_PER_CPU_MAPS
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 7fe9ea4ee1e7..9b3f92c5f0e0 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -111,9 +111,6 @@ enum x86_topology_domains {
 	TOPO_DIE_DOMAIN,
 	TOPO_DIEGRP_DOMAIN,
 	TOPO_PKG_DOMAIN,
-#ifdef CONFIG_NUMA
-	TOPO_NUMA_DOMAIN,
-#endif
 	TOPO_MAX_DOMAIN,
 };
 
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index 399388213bc0..1d3bed3ae40e 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -89,14 +89,6 @@ static inline u32 topo_apicid(u32 apicid, enum x86_topology_domains dom)
 {
 	if (dom == TOPO_SMT_DOMAIN)
 		return apicid;
-#ifdef CONFIG_NUMA
-	if (dom == TOPO_NUMA_DOMAIN) {
-		int nid = __apicid_to_phys_node[apicid];
-		if (nid == NUMA_NO_NODE)
-			nid = 0;
-		return nid;
-	}
-#endif
 	return apicid & (UINT_MAX << x86_topo_system.dom_shifts[dom - 1]);
 }
 
@@ -254,6 +246,8 @@ static __init void topo_register_apic(u32 apic_id, u32 acpi_id, bool present)
 	 */
 	for (dom = TOPO_SMT_DOMAIN; dom < TOPO_MAX_DOMAIN; dom++)
 		set_bit(topo_apicid(apic_id, dom), apic_maps[dom].map);
+
+	topo_register_apic_phys_node(apic_id);
 }
 
 /**
@@ -501,7 +495,7 @@ void __init topology_init_possible_cpus(void)
 	set_nr_cpu_ids(allowed);
 
 	cnta = domain_weight(TOPO_PKG_DOMAIN);
-	cntb = domain_weight(TOPO_NUMA_DOMAIN);
+	cntb = num_phys_nodes();
 
 	__num_nodes_per_package = DIV_ROUND_UP(cntb, cnta);
 
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 4556a1561aa0..b60076745a32 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -52,6 +52,8 @@ s16 __apicid_to_phys_node[MAX_LOCAL_APIC] = {
 	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
 };
 
+static nodemask_t apic_phys_node_map __ro_after_init;
+
 int numa_cpu_node(int cpu)
 {
 	u32 apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
@@ -61,6 +63,24 @@ int numa_cpu_node(int cpu)
 	return NUMA_NO_NODE;
 }
 
+static int topo_apicid_to_node(int apicid)
+{
+	int nid = __apicid_to_phys_node[apicid];
+	if (nid == NUMA_NO_NODE)
+		nid = 0;
+	return nid;
+}
+
+void __init topo_register_apic_phys_node(int apicid)
+{
+	set_bit(topo_apicid_to_node(apicid), apic_phys_node_map.bits);
+}
+
+int __init num_phys_nodes(void)
+{
+	return bitmap_weight(apic_phys_node_map.bits, MAX_NUMNODES);
+}
+
 cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
 EXPORT_SYMBOL(node_to_cpumask_map);
 
---

Slightly larger diffstat but all the NUMA bits are together.

Thoughts?

>  	return apicid & (UINT_MAX << x86_topo_system.dom_shifts[dom - 1]);
>  }
>  

-- 
Thanks and Regards,
Prateek
Re: [RFC][PATCH 2/6] x86/topo: Add TOPO_NUMA_DOMAIN
Posted by Peter Zijlstra 1 month, 1 week ago
On Fri, Feb 27, 2026 at 06:49:36PM +0530, K Prateek Nayak wrote:
> Hello Peter,
> 
> On 2/26/2026 4:19 PM, Peter Zijlstra wrote:
> > @@ -88,6 +89,14 @@ static inline u32 topo_apicid(u32 apicid
> >  {
> >  	if (dom == TOPO_SMT_DOMAIN)
> >  		return apicid;
> > +#ifdef CONFIG_NUMA
> > +	if (dom == TOPO_NUMA_DOMAIN) {
> > +		int nid = __apicid_to_phys_node[apicid];
> > +		if (nid == NUMA_NO_NODE)
> > +			nid = 0;
> > +		return nid;
> > +	}
> > +#endif
> 
> I'm not digging this override - simply because topo_apicid() was not
> meant to handle these kinds of cases where we cannot derive a topology
> ID by simply shifting and masking the APICID.
> 
> Looking at the series, all we need is an equivalent of:
> 
>   domain_weight(TOPO_NUMA_DOMAIN)

Fair enough; but then lets replace patch 1 and 2 with something like
that.

But I must note that the nodemask API is crap; it has both node_set() and
__node_set() be the atomic version :-(

Let me go rework the other patches to fit on this.

---
diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index 53ba39ce010c..a9063f332fa6 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -22,6 +22,7 @@ extern int numa_off;
  */
 extern s16 __apicid_to_node[MAX_LOCAL_APIC];
 extern nodemask_t numa_nodes_parsed __initdata;
+extern nodemask_t numa_phys_nodes_parsed __initdata;
 
 static inline void set_apicid_to_node(int apicid, s16 node)
 {
@@ -48,6 +49,7 @@ extern void __init init_cpu_to_node(void);
 extern void numa_add_cpu(unsigned int cpu);
 extern void numa_remove_cpu(unsigned int cpu);
 extern void init_gi_nodes(void);
+extern int num_phys_nodes(void);
 #else	/* CONFIG_NUMA */
 static inline void numa_set_node(int cpu, int node)	{ }
 static inline void numa_clear_node(int cpu)		{ }
@@ -55,6 +57,10 @@ static inline void init_cpu_to_node(void)		{ }
 static inline void numa_add_cpu(unsigned int cpu)	{ }
 static inline void numa_remove_cpu(unsigned int cpu)	{ }
 static inline void init_gi_nodes(void)			{ }
+static inline int num_phys_nodes(void)
+{
+	return 1;
+}
 #endif	/* CONFIG_NUMA */
 
 #ifdef CONFIG_DEBUG_PER_CPU_MAPS
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index 23190a786d31..bfcd33127789 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -31,6 +31,7 @@
 #include <asm/mpspec.h>
 #include <asm/msr.h>
 #include <asm/smp.h>
+#include <asm/numa.h>
 
 #include "cpu.h"
 
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 7a97327140df..99d0a9332c14 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -48,6 +48,8 @@ s16 __apicid_to_node[MAX_LOCAL_APIC] = {
 	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
 };
 
+nodemask_t numa_phys_nodes_parsed __initdata;
+
 int numa_cpu_node(int cpu)
 {
 	u32 apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
@@ -57,6 +59,11 @@ int numa_cpu_node(int cpu)
 	return NUMA_NO_NODE;
 }
 
+int __init num_phys_nodes(void)
+{
+	return bitmap_weight(numa_phys_nodes_parsed.bits, MAX_NUMNODES);
+}
+
 cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
 EXPORT_SYMBOL(node_to_cpumask_map);
 
@@ -210,6 +217,7 @@ static int __init dummy_numa_init(void)
 	       0LLU, PFN_PHYS(max_pfn) - 1);
 
 	node_set(0, numa_nodes_parsed);
+	node_set(0, numa_phys_nodes_parsed);
 	numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
 
 	return 0;
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 6f8e0f21c710..44ca66651756 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -57,6 +57,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
 	}
 	set_apicid_to_node(apic_id, node);
 	node_set(node, numa_nodes_parsed);
+	node_set(node, numa_phys_nodes_parsed);
 	pr_debug("SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", pxm, apic_id, node);
 }
 
@@ -97,6 +98,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
 
 	set_apicid_to_node(apic_id, node);
 	node_set(node, numa_nodes_parsed);
+	node_set(node, numa_phys_nodes_parsed);
 	pr_debug("SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", pxm, apic_id, node);
 }
Re: [RFC][PATCH 2/6] x86/topo: Add TOPO_NUMA_DOMAIN
Posted by K Prateek Nayak 1 month ago
Hello Peter,

On 2/27/2026 7:36 PM, Peter Zijlstra wrote:
>> Looking at the series, all we need is an equivalent of:
>>
>>   domain_weight(TOPO_NUMA_DOMAIN)
> 
> Fair enough; but then lets replace patch 1 and 2 with something like
> that.
> 
> But I must note that the nodemask API is crap; it has both node_set() and
> __node_set() be the atomic version :-(
> 
> Let me go rework the other patches to fit on this.

Boots fine with a s/domain_weight(TOPO_NUMA_DOMAIN)/num_phys_nodes()/
applied to Patch 3.

Topology looks fine for NPS4 on my 3rd Generation EPYC with 2 sockets,
and I haven't triggered any warning even with "L3 as NUMA" turned on.
Feel free to include:

Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>

-- 
Thanks and Regards,
Prateek
Re: [RFC][PATCH 2/6] x86/topo: Add TOPO_NUMA_DOMAIN
Posted by Peter Zijlstra 1 month ago
On Mon, Mar 02, 2026 at 09:46:57AM +0530, K Prateek Nayak wrote:
> Hello Peter,
> 
> On 2/27/2026 7:36 PM, Peter Zijlstra wrote:
> >> Looking at the series, all we need is an equivalent of:
> >>
> >>   domain_weight(TOPO_NUMA_DOMAIN)
> > 
> > Fair enough; but then lets replace patch 1 and 2 with something like
> > that.
> > 
> > But I must note that the nodemask API is crap; it has both node_set() and
> > __node_set() be the atomic version :-(
> > 
> > Let me go rework the other patches to fit on this.
> 
> Boots fine with a s/domain_weight(TOPO_NUMA_DOMAIN)/num_phys_nodes()/
> applied to Patch 3.
> 
> Topology looks fine for NPS4 on my 3rd Generation EPYC with 2 sockets,
> and I haven't triggered any warning even with "L3 as NUMA" turned on.
> Feel free to include:
> 
> Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>

Thanks!

I had a quick look at this NPS stuff, and that is more or less the same
as the intel SNC thing. With two notable exceptions:

 - you've stuck to power-of-two numbers (good!)

 - NPS0; I don't think Intel has anything like that (although I could be
   mistaken).

Now, the __num_nodes_per_package is obviously not going to work for
NPS0 (it bottoms out at 1).

Should we look at adding something for NPS0, or has that not been needed
(yet) ?
Re: [RFC][PATCH 2/6] x86/topo: Add TOPO_NUMA_DOMAIN
Posted by K Prateek Nayak 1 month ago
Hello Peter,

On 3/2/2026 8:40 PM, Peter Zijlstra wrote:
> On Mon, Mar 02, 2026 at 09:46:57AM +0530, K Prateek Nayak wrote:
>> Hello Peter,
>>
>> On 2/27/2026 7:36 PM, Peter Zijlstra wrote:
>>>> Looking at the series, all we need is an equivalent of:
>>>>
>>>>   domain_weight(TOPO_NUMA_DOMAIN)
>>>
>>> Fair enough; but then lets replace patch 1 and 2 with something like
>>> that.
>>>
>>> But I must note that the nodemask API is crap; it has both node_set() and
>>> __node_set() be the atomic version :-(
>>>
>>> Let me go rework the other patches to fit on this.
>>
>> Boots fine with a s/domain_weight(TOPO_NUMA_DOMAIN)/num_phys_nodes()/
>> applied to Patch 3.
>>
>> Topology looks fine for NPS4 on my 3rd Generation EPYC with 2 sockets,
>> and I haven't triggered any warning even with "L3 as NUMA" turned on.
>> Feel free to include:
>>
>> Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
> 
> Thanks!
> 
> I had a quick look at this NPS stuff, and that is more or less the same
> as the intel SNC thing. With two notable exceptions:
> 
>  - you've stuck to power-of-two numbers (good!)

Yeah but "L3 as NUMA" on a 6CCX machine doesn't follow that :-(
Is there any implicit dependency there?

P.S. All these configs are symmetric so those divisions should give the
correct results.

> 
>  - NPS0; I don't think Intel has anything like that (although I could be
>    mistaken).
> 
> Now, the __num_nodes_per_package is obviously not going to work for
> NPS0 (it bottoms out at 1).
> 
> Should we look at adding something for NPS0, or has that not been needed
> (yet) ?

Let me go boot into NPS0 to see what my machine thinks. But it shouldn't
do any harm because of the DIV_ROUND_UP(), right?

__num_nodes_per_package will be 1 (which is technically correct since
the whole package is indeed one node) and then we retain the PKG domain
so as far as those bits are concerned, it should be fine.

This is from my dual socket Zen3 booted into NPS0:

    CPU topo: Max. logical packages:   2
    CPU topo: Max. logical nodes:      1
    CPU topo: Num. nodes per package:  1
    CPU topo: Max. logical dies:       2
    CPU topo: Max. dies per package:   1
    CPU topo: Max. threads per core:   2
    CPU topo: Num. cores per package:    64
    CPU topo: Num. threads per package: 128
    CPU topo: Allowing 256 present CPUs plus 0 hotplug CPUs


CPU0's scheduler topology looks like:

    CPU0 attaching sched-domain(s):
     domain-0: span=0,128 level=SMT
      groups: 0:{ span=0 },
            128:{ span=128 }
      domain-1: span=0-7,128-135 level=MC
       groups: 0:{ span=0,128 cap=2048 },
               1:{ span=1,129 cap=2048 },
               2:{ span=2,130 cap=2048 },
               3:{ span=3,131 cap=2048 },
               4:{ span=4,132 cap=2048 },
               5:{ span=5,133 cap=2048 },
               6:{ span=6,134 cap=2048 },
               7:{ span=7,135 cap=2048 }
       domain-2: span=0-255 level=PKG
        groups:  0:{ span=0-7,128-135 cap=16384 },
                 8:{ span=8-15,136-143 cap=16384 },
                16:{ span=16-23,144-151 cap=16384 },
                24:{ span=24-31,152-159 cap=16384 },
                32:{ span=32-39,160-167 cap=16384 },
                40:{ span=40-47,168-175 cap=16384 },
                48:{ span=48-55,176-183 cap=16384 },
                56:{ span=56-63,184-191 cap=16384 },
                64:{ span=64-71,192-199 cap=16384 },
                72:{ span=72-79,200-207 cap=16384 },
                80:{ span=80-87,208-215 cap=16384 },
                88:{ span=88-95,216-223 cap=16384 },
                96:{ span=96-103,224-231 cap=16384 },
               104:{ span=104-111,232-239 cap=16384 },
               112:{ span=112-119,240-247 cap=16384 },
               120:{ span=120-127,248-255 cap=16384 }
    ...
    root domain span: 0-255


The PKG domain covers both the sockets since it uses the node mask which
covers the entire system.

-- 
Thanks and Regards,
Prateek
Re: [RFC][PATCH 2/6] x86/topo: Add TOPO_NUMA_DOMAIN
Posted by Peter Zijlstra 1 month ago
On Mon, Mar 02, 2026 at 09:05:03PM +0530, K Prateek Nayak wrote:

> > I had a quick look at this NPS stuff, and that is more or less the same
> > as the intel SNC thing. With two notable exceptions:
> > 
> >  - you've stuck to power-of-two numbers (good!)
> 
> Yeah but "L3 as NUMA" on a 6CCX machine doesn't follow that :-(
> Is there any implicit dependency there?
> 
> P.S. All these configs are symmetric so those divisions should give the
> correct results.

Nah, the code here doesn't care. Specifically the case at hand was
SNC-3, where we make one package have 3 nodes.

> >  - NPS0; I don't think Intel has anything like that (although I could be
> >    mistaken).
> > 
> > Now, the __num_nodes_per_package is obviously not going to work for
> > NPS0 (it bottoms out at 1).
> > 
> > Should we look at adding something for NPS0, or has that not been needed
> > (yet) ?
> 
> Let me go boot into NPS0 to see what my machine thinks. But it shouldn't
> do any harm because of the DIV_ROUND_UP(), right?

Right, no harm. And I've since realized you can detect it by:

	num_phys_nodes() == 1 && topology_max_packages() == 2