Recent AMD node rework removed the "search and count" method of caching
AMD root devices. This depended on the value from a Data Fabric register
that was expected to hold the PCI bus of one of the root devices
attached to that fabric.
However, this expectation is incorrect. The register, when read from PCI
config space, returns the bitwise-OR of the buses of all attached root
devices.
This behavior is benign on AMD reference design boards, since the bus
numbers are aligned. This results in a bitwise-OR value matching one of
the buses. For example, 0x00 | 0x40 | 0xA0 | 0xE0 = 0xE0.
This behavior breaks on boards where the bus numbers are not exactly
aligned. For example, 0x00 | 0x07 | 0xE0 | 0x15 = 0x1F.
The bus numbering style in the reference boards is not a requirement.
The numbering found in other boards is not incorrect. Therefore, the
root device caching method needs to be adjusted.
Go back to the "search and count" method used before the recent rework.
Search for root devices using PCI class code rather than fixed PCI IDs.
This keeps the goal of the rework (remove dependency on PCI IDs) while
being able to support various board designs.
Fixes: 40a5f6ffdfc8 ("x86/amd_nb: Simplify root device search")
Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Cc: stable@vger.kernel.org
---
arch/x86/include/asm/amd/node.h | 1 -
arch/x86/kernel/amd_node.c | 95 ++++++++++++++++-------------------------
2 files changed, 36 insertions(+), 60 deletions(-)
diff --git a/arch/x86/include/asm/amd/node.h b/arch/x86/include/asm/amd/node.h
index 23fe617898a8..a672b8765fa8 100644
--- a/arch/x86/include/asm/amd/node.h
+++ b/arch/x86/include/asm/amd/node.h
@@ -23,7 +23,6 @@
#define AMD_NODE0_PCI_SLOT 0x18
struct pci_dev *amd_node_get_func(u16 node, u8 func);
-struct pci_dev *amd_node_get_root(u16 node);
static inline u16 amd_num_nodes(void)
{
diff --git a/arch/x86/kernel/amd_node.c b/arch/x86/kernel/amd_node.c
index a40176b62eb5..4a3d9ca6e225 100644
--- a/arch/x86/kernel/amd_node.c
+++ b/arch/x86/kernel/amd_node.c
@@ -34,62 +34,6 @@ struct pci_dev *amd_node_get_func(u16 node, u8 func)
return pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(AMD_NODE0_PCI_SLOT + node, func));
}
-#define DF_BLK_INST_CNT 0x040
-#define DF_CFG_ADDR_CNTL_LEGACY 0x084
-#define DF_CFG_ADDR_CNTL_DF4 0xC04
-
-#define DF_MAJOR_REVISION GENMASK(27, 24)
-
-static u16 get_cfg_addr_cntl_offset(struct pci_dev *df_f0)
-{
- u32 reg;
-
- /*
- * Revision fields added for DF4 and later.
- *
- * Major revision of '0' is found pre-DF4. Field is Read-as-Zero.
- */
- if (pci_read_config_dword(df_f0, DF_BLK_INST_CNT, ®))
- return 0;
-
- if (reg & DF_MAJOR_REVISION)
- return DF_CFG_ADDR_CNTL_DF4;
-
- return DF_CFG_ADDR_CNTL_LEGACY;
-}
-
-struct pci_dev *amd_node_get_root(u16 node)
-{
- struct pci_dev *root;
- u16 cntl_off;
- u8 bus;
-
- if (!cpu_feature_enabled(X86_FEATURE_ZEN))
- return NULL;
-
- /*
- * D18F0xXXX [Config Address Control] (DF::CfgAddressCntl)
- * Bits [7:0] (SecBusNum) holds the bus number of the root device for
- * this Data Fabric instance. The segment, device, and function will be 0.
- */
- struct pci_dev *df_f0 __free(pci_dev_put) = amd_node_get_func(node, 0);
- if (!df_f0)
- return NULL;
-
- cntl_off = get_cfg_addr_cntl_offset(df_f0);
- if (!cntl_off)
- return NULL;
-
- if (pci_read_config_byte(df_f0, cntl_off, &bus))
- return NULL;
-
- /* Grab the pointer for the actual root device instance. */
- root = pci_get_domain_bus_and_slot(0, bus, 0);
-
- pci_dbg(root, "is root for AMD node %u\n", node);
- return root;
-}
-
static struct pci_dev **amd_roots;
/* Protect the PCI config register pairs used for SMN. */
@@ -274,16 +218,49 @@ DEFINE_SHOW_STORE_ATTRIBUTE(smn_node);
DEFINE_SHOW_STORE_ATTRIBUTE(smn_address);
DEFINE_SHOW_STORE_ATTRIBUTE(smn_value);
+static struct pci_dev *get_next_root(struct pci_dev *root)
+{
+ while ((root = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, root))) {
+ /* Root device is Device 0 Function 0. */
+ if (root->devfn)
+ continue;
+
+ if (root->vendor != PCI_VENDOR_ID_AMD &&
+ root->vendor != PCI_VENDOR_ID_HYGON)
+ continue;
+
+ break;
+ }
+
+ return root;
+}
+
static int amd_cache_roots(void)
{
- u16 node, num_nodes = amd_num_nodes();
+ u16 count = 0, num_roots = 0, roots_per_node, node = 0, num_nodes = amd_num_nodes();
+ struct pci_dev *root = NULL;
amd_roots = kcalloc(num_nodes, sizeof(*amd_roots), GFP_KERNEL);
if (!amd_roots)
return -ENOMEM;
- for (node = 0; node < num_nodes; node++)
- amd_roots[node] = amd_node_get_root(node);
+ while ((root = get_next_root(root))) {
+ pci_dbg(root, "is an AMD root device\n");
+ num_roots++;
+ }
+
+ pr_debug("Found %d AMD root devices\n", num_roots);
+
+ roots_per_node = num_roots / num_nodes;
+
+ while ((root = get_next_root(root)) && node < num_nodes) {
+ /* Use one root for each node and skip the rest. */
+ if (count++ % roots_per_node)
+ continue;
+
+ pci_dbg(root, "is root for AMD node %u\n", node);
+ amd_roots[node++] = root;
+ }
return 0;
}
--
2.51.0
On Tue, Sep 30, 2025 at 04:45:45PM +0000, Yazen Ghannam wrote:
> This behavior is benign on AMD reference design boards, since the bus
> numbers are aligned. This results in a bitwise-OR value matching one of
> the buses. For example, 0x00 | 0x40 | 0xA0 | 0xE0 = 0xE0.
>
> This behavior breaks on boards where the bus numbers are not exactly
> aligned. For example, 0x00 | 0x07 | 0xE0 | 0x15 = 0x1F.
<---
Please add here something along the lines of:
"And even if one could say, they both have bus 0x0 containing the root
devices, this is not true on the other AMD nodes besides 0."
> static int amd_cache_roots(void)
> {
> - u16 node, num_nodes = amd_num_nodes();
> + u16 count = 0, num_roots = 0, roots_per_node, node = 0, num_nodes = amd_num_nodes();
> + struct pci_dev *root = NULL;
>
> amd_roots = kcalloc(num_nodes, sizeof(*amd_roots), GFP_KERNEL);
> if (!amd_roots)
> return -ENOMEM;
>
> - for (node = 0; node < num_nodes; node++)
> - amd_roots[node] = amd_node_get_root(node);
> + while ((root = get_next_root(root))) {
> + pci_dbg(root, "is an AMD root device\n");
> + num_roots++;
> + }
> +
> + pr_debug("Found %d AMD root devices\n", num_roots);
> +
> + roots_per_node = num_roots / num_nodes;
What happens if num_roots = 0? IOW, you need to handle that here.
> +
> + while ((root = get_next_root(root)) && node < num_nodes) {
> + /* Use one root for each node and skip the rest. */
> + if (count++ % roots_per_node)
> + continue;
> +
> + pci_dbg(root, "is root for AMD node %u\n", node);
> + amd_roots[node++] = root;
> + }
If I squint my eyes hard enough, I can see you getting rid of the *three*
while loops here. So please try again.
Thx.
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
On Wed, Oct 22, 2025 at 01:13:42PM +0200, Borislav Petkov wrote:
> On Tue, Sep 30, 2025 at 04:45:45PM +0000, Yazen Ghannam wrote:
> > This behavior is benign on AMD reference design boards, since the bus
> > numbers are aligned. This results in a bitwise-OR value matching one of
> > the buses. For example, 0x00 | 0x40 | 0xA0 | 0xE0 = 0xE0.
> >
> > This behavior breaks on boards where the bus numbers are not exactly
> > aligned. For example, 0x00 | 0x07 | 0xE0 | 0x15 = 0x1F.
>
> <---
>
> Please add here something along the lines of:
>
> "And even if one could say, they both have bus 0x0 containing the root
> devices, this is not true on the other AMD nodes besides 0."
>
Okay, will do.
> > static int amd_cache_roots(void)
> > {
> > - u16 node, num_nodes = amd_num_nodes();
> > + u16 count = 0, num_roots = 0, roots_per_node, node = 0, num_nodes = amd_num_nodes();
> > + struct pci_dev *root = NULL;
> >
> > amd_roots = kcalloc(num_nodes, sizeof(*amd_roots), GFP_KERNEL);
> > if (!amd_roots)
> > return -ENOMEM;
> >
> > - for (node = 0; node < num_nodes; node++)
> > - amd_roots[node] = amd_node_get_root(node);
> > + while ((root = get_next_root(root))) {
> > + pci_dbg(root, "is an AMD root device\n");
> > + num_roots++;
> > + }
> > +
> > + pr_debug("Found %d AMD root devices\n", num_roots);
> > +
> > + roots_per_node = num_roots / num_nodes;
>
> What happens if num_roots = 0? IOW, you need to handle that here.
Yes, will do.
>
> > +
> > + while ((root = get_next_root(root)) && node < num_nodes) {
> > + /* Use one root for each node and skip the rest. */
> > + if (count++ % roots_per_node)
> > + continue;
> > +
> > + pci_dbg(root, "is root for AMD node %u\n", node);
> > + amd_roots[node++] = root;
> > + }
>
> If I squint my eyes hard enough, I can see you getting rid of the *three*
> while loops here. So please try again.
>
I don't follow. Do you mean to combine the other loops into this one? Or
that this loop should be expanded into three loops explicitly doing one
thing each?
Thanks,
Yazen
On Wed, Oct 22, 2025 at 09:19:04AM -0400, Yazen Ghannam wrote:
> I don't follow. Do you mean to combine the other loops into this one? Or
> that this loop should be expanded into three loops explicitly doing one
> thing each?
See if you can do everything in one loop. I.e., limit the amount of looping to
the least possible.
:-)
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
On Tue, Sep 30, 2025 at 04:45:45PM +0000, Yazen Ghannam wrote:
> Recent AMD node rework removed the "search and count" method of caching
> AMD root devices. This depended on the value from a Data Fabric register
> that was expected to hold the PCI bus of one of the root devices
> attached to that fabric.
>
> However, this expectation is incorrect. The register, when read from PCI
> config space, returns the bitwise-OR of the buses of all attached root
> devices.
>
> This behavior is benign on AMD reference design boards, since the bus
> numbers are aligned. This results in a bitwise-OR value matching one of
> the buses. For example, 0x00 | 0x40 | 0xA0 | 0xE0 = 0xE0.
>
> This behavior breaks on boards where the bus numbers are not exactly
> aligned. For example, 0x00 | 0x07 | 0xE0 | 0x15 = 0x1F.
Do I see it correctly that one of the root device's PCI bus is always 0x0 so
you can simply read that one and you can keep the current code?
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
On Tue, Oct 21, 2025 at 02:29:09PM +0200, Borislav Petkov wrote: > On Tue, Sep 30, 2025 at 04:45:45PM +0000, Yazen Ghannam wrote: > > Recent AMD node rework removed the "search and count" method of caching > > AMD root devices. This depended on the value from a Data Fabric register > > that was expected to hold the PCI bus of one of the root devices > > attached to that fabric. > > > > However, this expectation is incorrect. The register, when read from PCI > > config space, returns the bitwise-OR of the buses of all attached root > > devices. > > > > This behavior is benign on AMD reference design boards, since the bus > > numbers are aligned. This results in a bitwise-OR value matching one of > > the buses. For example, 0x00 | 0x40 | 0xA0 | 0xE0 = 0xE0. > > > > This behavior breaks on boards where the bus numbers are not exactly > > aligned. For example, 0x00 | 0x07 | 0xE0 | 0x15 = 0x1F. > > Do I see it correctly that one of the root device's PCI bus is always 0x0 so > you can simply read that one and you can keep the current code? > It's correct that one of the root devices will be on bus 0x0. But that device will be part of AMD node 0. So we still need to pick a root device for the other, if any, AMD nodes in the system. For example, a system with 2 AMD nodes could have this: Node 0 : 0x00 0x07 0x0e 0x15 Node 1 : 0x1c 0x23 0x2a 0x31 Thanks, Yazen
On Tue, Oct 21, 2025 at 09:37:45AM -0400, Yazen Ghannam wrote:
> It's correct that one of the root devices will be on bus 0x0. But that
> device will be part of AMD node 0. So we still need to pick a root
> device for the other, if any, AMD nodes in the system.
>
> For example, a system with 2 AMD nodes could have this:
> Node 0 : 0x00 0x07 0x0e 0x15
> Node 1 : 0x1c 0x23 0x2a 0x31
Aha, so next question: isn't there a way to figure out which is the *first*
PCI bus of a AMD node? My gut feeling tells me there should be some info
somewhere which we can use...
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
On Tue, Oct 21, 2025 at 04:15:00PM +0200, Borislav Petkov wrote: > On Tue, Oct 21, 2025 at 09:37:45AM -0400, Yazen Ghannam wrote: > > It's correct that one of the root devices will be on bus 0x0. But that > > device will be part of AMD node 0. So we still need to pick a root > > device for the other, if any, AMD nodes in the system. > > > > For example, a system with 2 AMD nodes could have this: > > Node 0 : 0x00 0x07 0x0e 0x15 > > Node 1 : 0x1c 0x23 0x2a 0x31 > > Aha, so next question: isn't there a way to figure out which is the *first* > PCI bus of a AMD node? My gut feeling tells me there should be some info > somewhere which we can use... > Yep, that was one of the goals of the recent rework. I thought the DF register held that info, but I was mistaken. I haven't been able to find anything else. We can keep looking, but I wanted to send out this patch as a fix in the meantime. Thanks, Yazen
On 9/30/2025 11:45 AM, Yazen Ghannam wrote:
> Recent AMD node rework removed the "search and count" method of caching
> AMD root devices. This depended on the value from a Data Fabric register
> that was expected to hold the PCI bus of one of the root devices
> attached to that fabric.
>
> However, this expectation is incorrect. The register, when read from PCI
> config space, returns the bitwise-OR of the buses of all attached root
> devices.
>
> This behavior is benign on AMD reference design boards, since the bus
> numbers are aligned. This results in a bitwise-OR value matching one of
> the buses. For example, 0x00 | 0x40 | 0xA0 | 0xE0 = 0xE0.
>
> This behavior breaks on boards where the bus numbers are not exactly
> aligned. For example, 0x00 | 0x07 | 0xE0 | 0x15 = 0x1F.
>
> The bus numbering style in the reference boards is not a requirement.
> The numbering found in other boards is not incorrect. Therefore, the
> root device caching method needs to be adjusted.
>
> Go back to the "search and count" method used before the recent rework.
> Search for root devices using PCI class code rather than fixed PCI IDs.
>
> This keeps the goal of the rework (remove dependency on PCI IDs) while
> being able to support various board designs.
>
> Fixes: 40a5f6ffdfc8 ("x86/amd_nb: Simplify root device search")
Was this a publicly reported failure?
If so is there a link to LKML or a Bugzilla with the details of the
failure you can include here?
> Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
> Cc: stable@vger.kernel.org
> ---
> arch/x86/include/asm/amd/node.h | 1 -
> arch/x86/kernel/amd_node.c | 95 ++++++++++++++++-------------------------
> 2 files changed, 36 insertions(+), 60 deletions(-)
>
> diff --git a/arch/x86/include/asm/amd/node.h b/arch/x86/include/asm/amd/node.h
> index 23fe617898a8..a672b8765fa8 100644
> --- a/arch/x86/include/asm/amd/node.h
> +++ b/arch/x86/include/asm/amd/node.h
> @@ -23,7 +23,6 @@
> #define AMD_NODE0_PCI_SLOT 0x18
>
> struct pci_dev *amd_node_get_func(u16 node, u8 func);
> -struct pci_dev *amd_node_get_root(u16 node);
>
> static inline u16 amd_num_nodes(void)
> {
> diff --git a/arch/x86/kernel/amd_node.c b/arch/x86/kernel/amd_node.c
> index a40176b62eb5..4a3d9ca6e225 100644
> --- a/arch/x86/kernel/amd_node.c
> +++ b/arch/x86/kernel/amd_node.c
> @@ -34,62 +34,6 @@ struct pci_dev *amd_node_get_func(u16 node, u8 func)
> return pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(AMD_NODE0_PCI_SLOT + node, func));
> }
>
> -#define DF_BLK_INST_CNT 0x040
> -#define DF_CFG_ADDR_CNTL_LEGACY 0x084
> -#define DF_CFG_ADDR_CNTL_DF4 0xC04
> -
> -#define DF_MAJOR_REVISION GENMASK(27, 24)
> -
> -static u16 get_cfg_addr_cntl_offset(struct pci_dev *df_f0)
> -{
> - u32 reg;
> -
> - /*
> - * Revision fields added for DF4 and later.
> - *
> - * Major revision of '0' is found pre-DF4. Field is Read-as-Zero.
> - */
> - if (pci_read_config_dword(df_f0, DF_BLK_INST_CNT, ®))
> - return 0;
> -
> - if (reg & DF_MAJOR_REVISION)
> - return DF_CFG_ADDR_CNTL_DF4;
> -
> - return DF_CFG_ADDR_CNTL_LEGACY;
> -}
> -
> -struct pci_dev *amd_node_get_root(u16 node)
> -{
> - struct pci_dev *root;
> - u16 cntl_off;
> - u8 bus;
> -
> - if (!cpu_feature_enabled(X86_FEATURE_ZEN))
> - return NULL;
> -
> - /*
> - * D18F0xXXX [Config Address Control] (DF::CfgAddressCntl)
> - * Bits [7:0] (SecBusNum) holds the bus number of the root device for
> - * this Data Fabric instance. The segment, device, and function will be 0.
> - */
> - struct pci_dev *df_f0 __free(pci_dev_put) = amd_node_get_func(node, 0);
> - if (!df_f0)
> - return NULL;
> -
> - cntl_off = get_cfg_addr_cntl_offset(df_f0);
> - if (!cntl_off)
> - return NULL;
> -
> - if (pci_read_config_byte(df_f0, cntl_off, &bus))
> - return NULL;
> -
> - /* Grab the pointer for the actual root device instance. */
> - root = pci_get_domain_bus_and_slot(0, bus, 0);
> -
> - pci_dbg(root, "is root for AMD node %u\n", node);
> - return root;
> -}
> -
> static struct pci_dev **amd_roots;
>
> /* Protect the PCI config register pairs used for SMN. */
> @@ -274,16 +218,49 @@ DEFINE_SHOW_STORE_ATTRIBUTE(smn_node);
> DEFINE_SHOW_STORE_ATTRIBUTE(smn_address);
> DEFINE_SHOW_STORE_ATTRIBUTE(smn_value);
>
> +static struct pci_dev *get_next_root(struct pci_dev *root)
> +{
> + while ((root = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, root))) {
> + /* Root device is Device 0 Function 0. */
> + if (root->devfn)
> + continue;
> +
> + if (root->vendor != PCI_VENDOR_ID_AMD &&
> + root->vendor != PCI_VENDOR_ID_HYGON)
> + continue;
> +
> + break;
> + }
> +
> + return root;
> +}
> +
> static int amd_cache_roots(void)
> {
> - u16 node, num_nodes = amd_num_nodes();
> + u16 count = 0, num_roots = 0, roots_per_node, node = 0, num_nodes = amd_num_nodes();
> + struct pci_dev *root = NULL;
>
> amd_roots = kcalloc(num_nodes, sizeof(*amd_roots), GFP_KERNEL);
> if (!amd_roots)
> return -ENOMEM;
>
> - for (node = 0; node < num_nodes; node++)
> - amd_roots[node] = amd_node_get_root(node);
> + while ((root = get_next_root(root))) {
> + pci_dbg(root, "is an AMD root device\n");
> + num_roots++;
> + }
> +
> + pr_debug("Found %d AMD root devices\n", num_roots);
> +
> + roots_per_node = num_roots / num_nodes;
> +
> + while ((root = get_next_root(root)) && node < num_nodes) {
> + /* Use one root for each node and skip the rest. */
> + if (count++ % roots_per_node)
> + continue;
> +
> + pci_dbg(root, "is root for AMD node %u\n", node);
> + amd_roots[node++] = root;
> + }
>
> return 0;
> }
>
On Tue, Sep 30, 2025 at 01:07:47PM -0500, Mario Limonciello (AMD) (kernel.org) wrote:
>
>
> On 9/30/2025 11:45 AM, Yazen Ghannam wrote:
> > Recent AMD node rework removed the "search and count" method of caching
> > AMD root devices. This depended on the value from a Data Fabric register
> > that was expected to hold the PCI bus of one of the root devices
> > attached to that fabric.
> >
> > However, this expectation is incorrect. The register, when read from PCI
> > config space, returns the bitwise-OR of the buses of all attached root
> > devices.
> >
> > This behavior is benign on AMD reference design boards, since the bus
> > numbers are aligned. This results in a bitwise-OR value matching one of
> > the buses. For example, 0x00 | 0x40 | 0xA0 | 0xE0 = 0xE0.
> >
> > This behavior breaks on boards where the bus numbers are not exactly
> > aligned. For example, 0x00 | 0x07 | 0xE0 | 0x15 = 0x1F.
> >
> > The bus numbering style in the reference boards is not a requirement.
> > The numbering found in other boards is not incorrect. Therefore, the
> > root device caching method needs to be adjusted.
> >
> > Go back to the "search and count" method used before the recent rework.
> > Search for root devices using PCI class code rather than fixed PCI IDs.
> >
> > This keeps the goal of the rework (remove dependency on PCI IDs) while
> > being able to support various board designs.
> >
> > Fixes: 40a5f6ffdfc8 ("x86/amd_nb: Simplify root device search")
>
> Was this a publicly reported failure?
>
> If so is there a link to LKML or a Bugzilla with the details of the failure
> you can include here?
>
No, it was reported off-list.
Thanks,
Yazen
© 2016 - 2026 Red Hat, Inc.