arch/x86/kernel/cpu/amd.c | 74 ------------------------------------
arch/x86/kernel/cpu/common.c | 74 ++++++++++++++++++++++++++++++++++++
arch/x86/kernel/cpu/hygon.c | 73 -----------------------------------
arch/x86/kernel/cpu/intel.c | 17 ---------
include/linux/topology.h | 1 +
5 files changed, 75 insertions(+), 164 deletions(-)
This change is provoked by an observed warning after
commit 717b64d58cff ("x86/topo: Replace x86_has_numa_in_package")
when faking numa nodes on intel.
For example:
qemu-system-x86_64 \
-kernel arch/x86/boot/bzImage \
-append "console=ttyS0 root=/dev/sda debug numa=fake=2" \
-hda $IMAGES/unstable.img \
-cpu qemu64,vendor=GenuineIntel \
-nographic \
-m 2G \
-smp 2
Will trigger:
[ 0.066755][ T0] ------------[ cut here ]------------
[ 0.066755][ T0] WARNING: arch/x86/kernel/smpboot.c:698 at
set_cpu_sibling_map+0xe41/0x1f90, CPU#1: swapper/1/0
[ 0.066755][ T0] Call Trace:
[ 0.066755][ T0] <TASK>
[ 0.066755][ T0] ap_starting+0x9e/0x140
[ 0.066755][ T0] ? __pfx_ap_starting+0x10/0x10
[ 0.066755][ T0] ? fpu__init_cpu_xstate+0x5c/0x320
[ 0.066755][ T0] start_secondary+0x66/0x110
[ 0.066755][ T0] common_startup_64+0x13e/0x147
[ 0.066755][ T0] </TASK>
smpboot.c suggests that the topology is invalid as
the CPUs are in the same package but different nodes.
Fix this by unifying the srat_detect_node function
among amd/intel/hygon and taking the amd/hygon approach
of falling back to LLC when SRAT is not detected.
Place the function inside common.c and expose it in topology.h
The hygon code is already basically identical to amd
except for the way it obtains the LLC ID.
We can reuse that from the hygon code since we
already have the struct cpuinfo_x86 passed to us.
Signed-off-by: Nikola Z. Ivanov <zlatistiv@gmail.com>
---
This is marked RFC as I lack the context for the reason
why the intel code looks the way it does. I can see
it went through a few changes in the 2008-2010 year range,
which makes me believe that the comment regarding
"not doing AMD heuristics for now" is long overdue.
Also is a merge like this even desired in the first place?
Any feedback is appreciated!
arch/x86/kernel/cpu/amd.c | 74 ------------------------------------
arch/x86/kernel/cpu/common.c | 74 ++++++++++++++++++++++++++++++++++++
arch/x86/kernel/cpu/hygon.c | 73 -----------------------------------
arch/x86/kernel/cpu/intel.c | 17 ---------
include/linux/topology.h | 1 +
5 files changed, 75 insertions(+), 164 deletions(-)
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 09de584e4c8f..7a4c804e6836 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -276,80 +276,6 @@ static void init_amd_k7(struct cpuinfo_x86 *c)
#endif
}
-#ifdef CONFIG_NUMA
-/*
- * To workaround broken NUMA config. Read the comment in
- * srat_detect_node().
- */
-static int nearby_node(int apicid)
-{
- int i, node;
-
- for (i = apicid - 1; i >= 0; i--) {
- node = __apicid_to_node[i];
- if (node != NUMA_NO_NODE && node_online(node))
- return node;
- }
- for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
- node = __apicid_to_node[i];
- if (node != NUMA_NO_NODE && node_online(node))
- return node;
- }
- return first_node(node_online_map); /* Shouldn't happen */
-}
-#endif
-
-static void srat_detect_node(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_NUMA
- int cpu = smp_processor_id();
- int node;
- unsigned apicid = c->topo.apicid;
-
- node = numa_cpu_node(cpu);
- if (node == NUMA_NO_NODE)
- node = per_cpu_llc_id(cpu);
-
- /*
- * On multi-fabric platform (e.g. Numascale NumaChip) a
- * platform-specific handler needs to be called to fixup some
- * IDs of the CPU.
- */
- if (x86_cpuinit.fixup_cpu_id)
- x86_cpuinit.fixup_cpu_id(c, node);
-
- if (!node_online(node)) {
- /*
- * Two possibilities here:
- *
- * - The CPU is missing memory and no node was created. In
- * that case try picking one from a nearby CPU.
- *
- * - The APIC IDs differ from the HyperTransport node IDs
- * which the K8 northbridge parsing fills in. Assume
- * they are all increased by a constant offset, but in
- * the same order as the HT nodeids. If that doesn't
- * result in a usable node fall back to the path for the
- * previous case.
- *
- * This workaround operates directly on the mapping between
- * APIC ID and NUMA node, assuming certain relationship
- * between APIC ID, HT node ID and NUMA topology. As going
- * through CPU mapping may alter the outcome, directly
- * access __apicid_to_node[].
- */
- int ht_nodeid = c->topo.initial_apicid;
-
- if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
- node = __apicid_to_node[ht_nodeid];
- /* Pick a nearby node */
- if (!node_online(node))
- node = nearby_node(apicid);
- }
- numa_set_node(cpu, node);
-#endif
-}
-
static void bsp_determine_snp(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_ARCH_HAS_CC_PLATFORM
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index a8ff4376c286..05fcfa7a5cb5 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -2496,6 +2496,80 @@ void cpu_init(void)
load_fixmap_gdt(cpu);
}
+#ifdef CONFIG_NUMA
+/*
+ * To workaround broken NUMA config. Read the comment in
+ * srat_detect_node().
+ */
+static int nearby_node(int apicid)
+{
+ int i, node;
+
+ for (i = apicid - 1; i >= 0; i--) {
+ node = __apicid_to_node[i];
+ if (node != NUMA_NO_NODE && node_online(node))
+ return node;
+ }
+ for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
+ node = __apicid_to_node[i];
+ if (node != NUMA_NO_NODE && node_online(node))
+ return node;
+ }
+ return first_node(node_online_map); /* Shouldn't happen */
+}
+#endif
+
+void srat_detect_node(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_NUMA
+ int cpu = smp_processor_id();
+ int node;
+ unsigned int apicid = c->topo.apicid;
+
+ node = numa_cpu_node(cpu);
+ if (node == NUMA_NO_NODE)
+ node = c->topo.llc_id;
+
+ /*
+ * On multi-fabric platform (e.g. Numascale NumaChip) a
+ * platform-specific handler needs to be called to fixup some
+ * IDs of the CPU.
+ */
+ if (x86_cpuinit.fixup_cpu_id)
+ x86_cpuinit.fixup_cpu_id(c, node);
+
+ if (!node_online(node)) {
+ /*
+ * Two possibilities here:
+ *
+ * - The CPU is missing memory and no node was created. In
+ * that case try picking one from a nearby CPU.
+ *
+ * - The APIC IDs differ from the HyperTransport node IDs
+ * which the K8 northbridge parsing fills in. Assume
+ * they are all increased by a constant offset, but in
+ * the same order as the HT nodeids. If that doesn't
+ * result in a usable node fall back to the path for the
+ * previous case.
+ *
+ * This workaround operates directly on the mapping between
+ * APIC ID and NUMA node, assuming certain relationship
+ * between APIC ID, HT node ID and NUMA topology. As going
+ * through CPU mapping may alter the outcome, directly
+ * access __apicid_to_node[].
+ */
+ int ht_nodeid = c->topo.initial_apicid;
+
+ if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
+ node = __apicid_to_node[ht_nodeid];
+ /* Pick a nearby node */
+ if (!node_online(node))
+ node = nearby_node(apicid);
+ }
+ numa_set_node(cpu, node);
+#endif
+}
+
#ifdef CONFIG_MICROCODE_LATE_LOADING
/**
* store_cpu_caps() - Store a snapshot of CPU capabilities
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
index 7f95a74e4c65..a33735094843 100644
--- a/arch/x86/kernel/cpu/hygon.c
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -20,79 +20,6 @@
#include "cpu.h"
-#ifdef CONFIG_NUMA
-/*
- * To workaround broken NUMA config. Read the comment in
- * srat_detect_node().
- */
-static int nearby_node(int apicid)
-{
- int i, node;
-
- for (i = apicid - 1; i >= 0; i--) {
- node = __apicid_to_node[i];
- if (node != NUMA_NO_NODE && node_online(node))
- return node;
- }
- for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
- node = __apicid_to_node[i];
- if (node != NUMA_NO_NODE && node_online(node))
- return node;
- }
- return first_node(node_online_map); /* Shouldn't happen */
-}
-#endif
-
-static void srat_detect_node(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_NUMA
- int cpu = smp_processor_id();
- int node;
- unsigned int apicid = c->topo.apicid;
-
- node = numa_cpu_node(cpu);
- if (node == NUMA_NO_NODE)
- node = c->topo.llc_id;
-
- /*
- * On multi-fabric platform (e.g. Numascale NumaChip) a
- * platform-specific handler needs to be called to fixup some
- * IDs of the CPU.
- */
- if (x86_cpuinit.fixup_cpu_id)
- x86_cpuinit.fixup_cpu_id(c, node);
-
- if (!node_online(node)) {
- /*
- * Two possibilities here:
- *
- * - The CPU is missing memory and no node was created. In
- * that case try picking one from a nearby CPU.
- *
- * - The APIC IDs differ from the HyperTransport node IDs.
- * Assume they are all increased by a constant offset, but
- * in the same order as the HT nodeids. If that doesn't
- * result in a usable node fall back to the path for the
- * previous case.
- *
- * This workaround operates directly on the mapping between
- * APIC ID and NUMA node, assuming certain relationship
- * between APIC ID, HT node ID and NUMA topology. As going
- * through CPU mapping may alter the outcome, directly
- * access __apicid_to_node[].
- */
- int ht_nodeid = c->topo.initial_apicid;
-
- if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
- node = __apicid_to_node[ht_nodeid];
- /* Pick a nearby node */
- if (!node_online(node))
- node = nearby_node(apicid);
- }
- numa_set_node(cpu, node);
-#endif
-}
-
static void bsp_init_hygon(struct cpuinfo_x86 *c)
{
if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 646ff33c4651..12eeacb0de4b 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -467,23 +467,6 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
}
#endif
-static void srat_detect_node(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_NUMA
- unsigned node;
- int cpu = smp_processor_id();
-
- /* Don't do the funky fallback heuristics the AMD version employs
- for now. */
- node = numa_cpu_node(cpu);
- if (node == NUMA_NO_NODE || !node_online(node)) {
- /* reuse the value from init_cpu_to_node() */
- node = cpu_to_node(cpu);
- }
- numa_set_node(cpu, node);
-#endif
-}
-
static void init_cpuid_fault(struct cpuinfo_x86 *c)
{
u64 msr;
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 6575af39fd10..9f71ad8a6983 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -41,6 +41,7 @@
#endif
int arch_update_cpu_topology(void);
+void srat_detect_node(struct cpuinfo_x86 *c);
/* Conform to ACPI 2.0 SLIT distance definitions */
#define LOCAL_DISTANCE 10
--
2.53.0
Hello Nikola,
On 3/29/2026 5:38 PM, Nikola Z. Ivanov wrote:
> This change is provoked by an observed warning after
> commit 717b64d58cff ("x86/topo: Replace x86_has_numa_in_package")
> when faking numa nodes on intel.
>
> For example:
>
> qemu-system-x86_64 \
> -kernel arch/x86/boot/bzImage \
> -append "console=ttyS0 root=/dev/sda debug numa=fake=2" \
> -hda $IMAGES/unstable.img \
> -cpu qemu64,vendor=GenuineIntel \
> -nographic \
> -m 2G \
> -smp 2 \
You can also say:
-smp 2,sockets=2
and that fixes the warning but that is not a valid solution? Why?
>
> Will trigger:
>
> [ 0.066755][ T0] ------------[ cut here ]------------
> [ 0.066755][ T0] WARNING: arch/x86/kernel/smpboot.c:698 at
> set_cpu_sibling_map+0xe41/0x1f90, CPU#1: swapper/1/0
> [ 0.066755][ T0] Call Trace:
> [ 0.066755][ T0] <TASK>
> [ 0.066755][ T0] ap_starting+0x9e/0x140
> [ 0.066755][ T0] ? __pfx_ap_starting+0x10/0x10
> [ 0.066755][ T0] ? fpu__init_cpu_xstate+0x5c/0x320
> [ 0.066755][ T0] start_secondary+0x66/0x110
> [ 0.066755][ T0] common_startup_64+0x13e/0x147
> [ 0.066755][ T0] </TASK>
>
> smpboot.c suggests that the topology is invalid as
> the CPUs are in the same package but different nodes.
To me, that looks like a broken topology from a virtualization use case
and the user can easily go fix their QEMU cmdline if they care. I'm
pretty sure folks using NUMA emulation in production know what they are
doing.
>
> Fix this by unifying the srat_detect_node function
> among amd/intel/hygon and taking the amd/hygon approach
> of falling back to LLC when SRAT is not detected.
As far as the AMD, Hygon unification goes, I don't mind that but
someone has to confirm if nearby_node() holds for all APICID
distribution on Intel.
> Place the function inside common.c and expose it in topology.h
There is no need to make it visible out of arch/x86/kernel/cpu/
Perhaps arch/x86/kernel/cpu/cpu.h?
>
> The hygon code is already basically identical to amd
> except for the way it obtains the LLC ID.
> We can reuse that from the hygon code since we
> already have the struct cpuinfo_x86 passed to us.
>
> Signed-off-by: Nikola Z. Ivanov <zlatistiv@gmail.com>
> ---
> This is marked RFC as I lack the context for the reason
> why the intel code looks the way it does. I can see
> it went through a few changes in the 2008-2010 year range,
> which makes be believe that the comment regarding
> "not doing AMD heuristics for now" is long overdue.
So prior to your patch, if I launch:
-smp 4,sockets=2,cores=2
and "numa=fake=2", the srat_detect_node() for an Intel VM maps:
CPU#0 -> Node#0
CPU#1 -> Node#1
CPU#2 -> Node#0
CPU#3 -> Node#1
Which resembles Intel baremetal node assignments where the CPUs
are interleaved. After your patch, it does:
CPU#0 -> Node#0
CPU#1 -> Node#0
CPU#2 -> Node#0
CPU#3 -> Node#0
So despite there being 2 LLCs, the Node assignments all go to Node#0
which may have other unintended consequences.
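The statement "falling back to LLC when SRAT is not detected." isn't
accurate, right? We have 2 LLCs and 2 Nodes but topology bits associate
both LLCs to the same node.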
accurate right? We have 2 LLCs and 2 Nodes but topology bits associate
both LLCs to the same node.
I'm all for unifying the AMD and Hygon's srat_detect_node() but unifying
all three for an obviously broken use-case isn't a good motivation.
I'll let others comment since they are more familiar with the NUMA
emulation bits and maybe all this is acceptable.
--
Thanks and Regards,
Prateek
On 3/30/26 7:57 AM, K Prateek Nayak wrote:
> Hello Nikola,
>
> On 3/29/2026 5:38 PM, Nikola Z. Ivanov wrote:
>> This change is provoked by an observed warning after
>> commit 717b64d58cff ("x86/topo: Replace x86_has_numa_in_package")
>> when faking numa nodes on intel.
>>
>> For example:
>>
>> qemu-system-x86_64 \
>> -kernel arch/x86/boot/bzImage \
>> -append "console=ttyS0 root=/dev/sda debug numa=fake=2" \
>> -hda $IMAGES/unstable.img \
>> -cpu qemu64,vendor=GenuineIntel \
>> -nographic \
>> -m 2G \
>> -smp 2 \
> You can also say:
>
> -smp 2,sockets=2
>
> and that fixes the warning but that is not a valid solution? Why?
>
>> Will trigger:
>>
>> [ 0.066755][ T0] ------------[ cut here ]------------
>> [ 0.066755][ T0] WARNING: arch/x86/kernel/smpboot.c:698 at
>> set_cpu_sibling_map+0xe41/0x1f90, CPU#1: swapper/1/0
>> [ 0.066755][ T0] Call Trace:
>> [ 0.066755][ T0] <TASK>
>> [ 0.066755][ T0] ap_starting+0x9e/0x140
>> [ 0.066755][ T0] ? __pfx_ap_starting+0x10/0x10
>> [ 0.066755][ T0] ? fpu__init_cpu_xstate+0x5c/0x320
>> [ 0.066755][ T0] start_secondary+0x66/0x110
>> [ 0.066755][ T0] common_startup_64+0x13e/0x147
>> [ 0.066755][ T0] </TASK>
>>
>> smpboot.c suggests that the topology is invalid as
>> the CPUs are in the same package but different nodes.
> To me, that looks like a broken topology from a virtualization use case
> and the user can easily go fix their QEMU cmdlline if they care. I'm
> pretty sure folks using NUMA emulation in production know what they are
> doing.
>
>> Fix this by unifying the srat_detect_node function
>> among amd/intel/hygon and taking the amd/hygon approach
>> of falling back to LLC when SRAT is not detected.
> As far as the AMD, Hygon unification goes, I don't mind that but
> someone has to confirm if nearby_node() holds for all APICID
> distribution on Intel.
>
>> Place the function inside common.c and expose it in topology.h
> There is no need to make it visible out of arch/x86/kernel/cpu/
> Perhaps arch/x86/kernel/cpu/cpu/cpu.h?
Yes, I made a pretty bad mistake here, not least
because topology.h is not x86-specific, so the
declaration there will cause build warnings.
>> The hygon code is already basically identical to amd
>> except for the way it obtains the LLC ID.
>> We can reuse that from the hygon code since we
>> already have the struct cpuinfo_x86 passed to us.
>>
>> Signed-off-by: Nikola Z. Ivanov <zlatistiv@gmail.com>
>> ---
>> This is marked RFC as I lack the context for the reason
>> why the intel code looks the way it does. I can see
>> it went through a few changes in the 2008-2010 year range,
>> which makes be believe that the comment regarding
>> "not doing AMD heuristics for now" is long overdue.
> So prior to you patch, If I launch:
>
> -smp 4,sockets=2,cores=2
>
> and "numa=fake=2", the srat_detect_node() for an Intel VM maps:
>
> CPU#0 -> Node#0
> CPU#1 -> Node#1
> CPU#2 -> Node#0
> CPU#3 -> Node#1
This is not necessarily correct either.
Those qemu parameters do not produce
an interleaved topology, but the cpu to node
map will end up interleaved, this ties back to the warning in smpboot.c
This is what it looks like pre-patch:
cpu to node mapping (interleaved):
# ls -l /sys/devices/system/cpu/cpu*/node*
lrwxrwxrwx 1 root root 0 Apr 1 13:57
/sys/devices/system/cpu/cpu0/node0 -> ../../node/node0
lrwxrwxrwx 1 root root 0 Apr 1 13:57
/sys/devices/system/cpu/cpu1/node1 -> ../../node/node1
lrwxrwxrwx 1 root root 0 Apr 1 13:57
/sys/devices/system/cpu/cpu2/node0 -> ../../node/node0
lrwxrwxrwx 1 root root 0 Apr 1 13:57
/sys/devices/system/cpu/cpu3/node1 -> ../../node/node1
#
cpu to socket (not interleaved):
# lscpu -e
CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE
0 0 0 0 0:0:0:0 yes
1 0 0 1 1:1:1:0 yes
2 0 1 2 2:2:2:1 yes
3 0 1 3 3:3:3:1 yes
#
The NODE output of "lscpu -e" is kind of bogus,
as it doesn't look up the cpu to node mapping, but instead
reads the node to cpu map and figures out the reverse,
which caused me to make a lot of false assumptions earlier...
However, this is an unrelated matter and the SOCKET is correct.
The early initialization code first assigns nodes in round-robin fashion,
which is later overridden on AMD by srat_detect_node,
but persists for Intel.
>
> Which resembles Intel baremetal node assignments where the CPUs
> are interleaved. After your patch, it does:
>
> CPU#0 -> Node#0
> CPU#1 -> Node#0
> CPU#2 -> Node#0
> CPU#3 -> Node#0
This happens because we take an interesting path in srat_detect_node
when the patch is applied.
The cpuid to llc_id map looks like this:
0 -> 0
1 -> 0
2 -> 2
3 -> 2
Since node 2 does not exist, we enter the if(!node_online(node))
path and our final mapping ends up like this:
0 -> 0
1 -> 0
2 -> 0
3 -> 0
> So despite there being 2 LLCs, the Node assignments all go to Node#0
> which may have other unintended consequences.
>
> The statement "falling back to LLC when SRAT is not detected." isn't
> accurate right? We have 2 LLCs and 2 Nodes but topology bits associate
> both LLCs to the same node.
>
> I'm all for unifying the AMD and Hygon's srat_detect_node() but unifying
> all three for an obviously broken use-case isn't a good motivation.
>
> I'll let others comment since they are more familiar with the NUMA
> emulation bits and maybe all this is acceptable.
>
Hi Prateek,
Thank you for the feedback!
I have tried to dig a bit deeper
and left my findings in response.
I suppose I will wait to see if someone
chimes in with some more details, if not
I will do as you suggest and try to come up with
a good way to unify amd/hygon and leave intel as is.
© 2016 - 2026 Red Hat, Inc.