[v1] (v)PCI: extended capability handling

[PATCH 1/6] PCI: determine whether a device has extended config space

Posted by Jan Beulich 1 month ago

Legacy PCI devices don't have any extended config space. Reading any part
thereof may return all ones or other arbitrary data, e.g. in some cases
base config space contents repeatedly.

Logic follows Linux 6.19-rc's pci_cfg_space_size(), albeit leveraging our
determination of device type; in particular some comments are taken
verbatim from there.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
Note that alloc_pdev()'s switch doesn't handle DEV_TYPE_PCI2PCIe_BRIDGE at
all. Such bridges will therefore not have ->ext_cfg set (which is likely
wrong). Shouldn't we handle them like DEV_TYPE_LEGACY_PCI_BRIDGE (or
DEV_TYPE_PCI?) anyway (just like VT-d's set_msi_source_id() does)?

Linux also has CONFIG_PCI_QUIRKS, allowing to compile out the slightly
risky code (as reads may in principle have side effects). Should we gain
such, too?

--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -310,6 +310,41 @@ static void apply_quirks(struct pci_dev
              * from trying to size the BARs or add handlers to trap accesses.
              */
             pdev->ignore_bars = true;
+
+    if ( pdev->ext_cfg )
+    {
+        unsigned int pos;
+
+        /*
+         * PCI Express to PCI/PCI-X Bridge Specification, rev 1.0, 4.1.4 says
+         * that when forwarding a type1 configuration request the bridge must
+         * check that the extended register address field is zero.  The bridge
+         * is not permitted to forward the transactions and must handle it as
+         * an Unsupported Request.  Some bridges do not follow this rule and
+         * simply drop the extended register bits, resulting in the standard
+         * config space being aliased, every 256 bytes across the entire
+         * configuration space.  Test for this condition by comparing the first
+         * dword of each potential alias to the vendor/device ID.
+         * Known offenders:
+         *   ASM1083/1085 PCIe-to-PCI Reversible Bridge (1b21:1080, rev 01 & 03)
+         *   AMD/ATI SBx00 PCI to PCI Bridge (1002:4384, rev 40)
+         */
+        for ( pos = PCI_CFG_SPACE_SIZE;
+              pos < PCI_CFG_SPACE_EXP_SIZE; pos += PCI_CFG_SPACE_SIZE )
+        {
+            if ( pci_conf_read16(pdev->sbdf, pos) != vendor ||
+                 pci_conf_read16(pdev->sbdf, pos + 2) != device )
+                break;
+        }
+
+        if ( pos >= PCI_CFG_SPACE_EXP_SIZE )
+        {
+            printk(XENLOG_WARNING
+                   "%pp: extended config space aliases base one\n",
+                   &pdev->sbdf);
+            pdev->ext_cfg = false;
+        }
+    }
 }
 
 static struct pci_dev *alloc_pdev(struct pci_seg *pseg, u8 bus, u8 devfn)
@@ -351,6 +386,8 @@ static struct pci_dev *alloc_pdev(struct
         unsigned long flags;
 
         case DEV_TYPE_PCIe2PCI_BRIDGE:
+            pdev->ext_cfg = true;
+            fallthrough;
         case DEV_TYPE_LEGACY_PCI_BRIDGE:
             sec_bus = pci_conf_read8(pdev->sbdf, PCI_SECONDARY_BUS);
             sub_bus = pci_conf_read8(pdev->sbdf, PCI_SUBORDINATE_BUS);
@@ -363,9 +400,19 @@ static struct pci_dev *alloc_pdev(struct
                 pseg->bus2bridge[sec_bus].devfn = devfn;
             }
             spin_unlock_irqrestore(&pseg->bus2bridge_lock, flags);
+
+            fallthrough;
+        case DEV_TYPE_PCI:
+            pos = pci_find_cap_offset(pdev->sbdf, PCI_CAP_ID_PCIX);
+            if ( pos &&
+                 (pci_conf_read32(pdev->sbdf, pos + PCI_X_STATUS) &
+                  (PCI_X_STATUS_266MHZ | PCI_X_STATUS_533MHZ)) )
+                pdev->ext_cfg = true;
             break;
 
         case DEV_TYPE_PCIe_ENDPOINT:
+            pdev->ext_cfg = true;
+
             pos = pci_find_cap_offset(pdev->sbdf, PCI_CAP_ID_EXP);
             BUG_ON(!pos);
             cap = pci_conf_read16(pdev->sbdf, pos + PCI_EXP_DEVCAP);
@@ -409,9 +456,9 @@ static struct pci_dev *alloc_pdev(struct
             }
             break;
 
-        case DEV_TYPE_PCI:
         case DEV_TYPE_PCIe_BRIDGE:
         case DEV_TYPE_PCI_HOST_BRIDGE:
+            pdev->ext_cfg = true;
             break;
 
         default:
@@ -420,6 +467,19 @@ static struct pci_dev *alloc_pdev(struct
             break;
     }
 
+    if ( pdev->ext_cfg &&
+         /*
+          * Regular PCI devices have 256 bytes, but PCI-X 2 and PCI Express
+          * devices have 4096 bytes.  Even if the device is capable, that
+          * doesn't mean we can access it.  Maybe we don't have a way to
+          * generate extended config space accesses, or the device is behind a
+          * reverse Express bridge.  So we try reading the dword at
+          * PCI_CFG_SPACE_SIZE which must either be 0 or a valid extended
+          * capability header.
+          */
+         pci_conf_read32(pdev->sbdf, PCI_CFG_SPACE_SIZE) == 0xffffffffU )
+        pdev->ext_cfg = false;
+
     apply_quirks(pdev);
     check_pdev(pdev);
 
@@ -717,6 +777,11 @@ int pci_add_device(u16 seg, u8 bus, u8 d
 
                 list_add(&pdev->vf_list, &pf_pdev->vf_list);
             }
+
+            if ( !pdev->ext_cfg )
+                printk(XENLOG_WARNING
+                       "%pp: VF without extended config space?\n",
+                       &pdev->sbdf);
         }
     }
 
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -126,6 +126,9 @@ struct pci_dev {
 
     nodeid_t node; /* NUMA node */
 
+    /* Whether the device has extended config space. */
+    bool ext_cfg;
+
     /* Device to be quarantined, don't automatically re-assign to dom0 */
     bool quarantine;

Re: [PATCH 1/6] PCI: determine whether a device has extended config space

Posted by Stewart Hildebrand 3 weeks, 6 days ago

On 1/6/26 08:47, Jan Beulich wrote:
> Legacy PCI devices don't have any extended config space. Reading any part
> thereof may return all ones or other arbitrary data, e.g. in some cases
> base config space contents repeatedly.
> 
> Logic follows Linux 6.19-rc's pci_cfg_space_size(), albeit leveraging our
> determination of device type; in particular some comments are taken
> verbatim from there.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> Note that alloc_pdev()'s switch doesn't handle DEV_TYPE_PCI2PCIe_BRIDGE at
> all. Such bridges will therefore not have ->ext_cfg set (which is likely
> wrong).

I initially read "set" as in "set to true", but I think you mean that ext_cfg
isn't assigned at all. Though perhaps it should actually be set to true,
because ...

> Shouldn't we handle them like DEV_TYPE_LEGACY_PCI_BRIDGE (or
> DEV_TYPE_PCI?) anyway (just like VT-d's set_msi_source_id() does)?

... in pdev_type(), we will only reach DEV_TYPE_PCI2PCIe_BRIDGE if it has
PCI_CAP_ID_EXP, which would indicate the device has extended config. So maybe it
would be better to handle it similar to DEV_TYPE_PCIe2PCI_BRIDGE in
alloc_pdev().

> Linux also has CONFIG_PCI_QUIRKS, allowing to compile out the slightly
> risky code (as reads may in principle have side effects). Should we gain
> such, too?

I don't have a strong opinion here.

> --- a/xen/drivers/passthrough/pci.c
> +++ b/xen/drivers/passthrough/pci.c
> @@ -310,6 +310,41 @@ static void apply_quirks(struct pci_dev
>               * from trying to size the BARs or add handlers to trap accesses.
>               */
>              pdev->ignore_bars = true;
> +
> +    if ( pdev->ext_cfg )
> +    {
> +        unsigned int pos;
> +
> +        /*
> +         * PCI Express to PCI/PCI-X Bridge Specification, rev 1.0, 4.1.4 says
> +         * that when forwarding a type1 configuration request the bridge must
> +         * check that the extended register address field is zero.  The bridge
> +         * is not permitted to forward the transactions and must handle it as
> +         * an Unsupported Request.  Some bridges do not follow this rule and
> +         * simply drop the extended register bits, resulting in the standard
> +         * config space being aliased, every 256 bytes across the entire
> +         * configuration space.  Test for this condition by comparing the first
> +         * dword of each potential alias to the vendor/device ID.
> +         * Known offenders:
> +         *   ASM1083/1085 PCIe-to-PCI Reversible Bridge (1b21:1080, rev 01 & 03)
> +         *   AMD/ATI SBx00 PCI to PCI Bridge (1002:4384, rev 40)
> +         */
> +        for ( pos = PCI_CFG_SPACE_SIZE;
> +              pos < PCI_CFG_SPACE_EXP_SIZE; pos += PCI_CFG_SPACE_SIZE )
> +        {
> +            if ( pci_conf_read16(pdev->sbdf, pos) != vendor ||
> +                 pci_conf_read16(pdev->sbdf, pos + 2) != device )
> +                break;
> +        }
> +
> +        if ( pos >= PCI_CFG_SPACE_EXP_SIZE )
> +        {
> +            printk(XENLOG_WARNING
> +                   "%pp: extended config space aliases base one\n",
> +                   &pdev->sbdf);
> +            pdev->ext_cfg = false;
> +        }
> +    }
>  }
>  
>  static struct pci_dev *alloc_pdev(struct pci_seg *pseg, u8 bus, u8 devfn)
> @@ -351,6 +386,8 @@ static struct pci_dev *alloc_pdev(struct
>          unsigned long flags;
>  
>          case DEV_TYPE_PCIe2PCI_BRIDGE:
> +            pdev->ext_cfg = true;
> +            fallthrough;
>          case DEV_TYPE_LEGACY_PCI_BRIDGE:
>              sec_bus = pci_conf_read8(pdev->sbdf, PCI_SECONDARY_BUS);
>              sub_bus = pci_conf_read8(pdev->sbdf, PCI_SUBORDINATE_BUS);
> @@ -363,9 +400,19 @@ static struct pci_dev *alloc_pdev(struct
>                  pseg->bus2bridge[sec_bus].devfn = devfn;
>              }
>              spin_unlock_irqrestore(&pseg->bus2bridge_lock, flags);
> +
> +            fallthrough;
> +        case DEV_TYPE_PCI:
> +            pos = pci_find_cap_offset(pdev->sbdf, PCI_CAP_ID_PCIX);
> +            if ( pos &&
> +                 (pci_conf_read32(pdev->sbdf, pos + PCI_X_STATUS) &
> +                  (PCI_X_STATUS_266MHZ | PCI_X_STATUS_533MHZ)) )
> +                pdev->ext_cfg = true;
>              break;
>  
>          case DEV_TYPE_PCIe_ENDPOINT:
> +            pdev->ext_cfg = true;
> +
>              pos = pci_find_cap_offset(pdev->sbdf, PCI_CAP_ID_EXP);
>              BUG_ON(!pos);
>              cap = pci_conf_read16(pdev->sbdf, pos + PCI_EXP_DEVCAP);
> @@ -409,9 +456,9 @@ static struct pci_dev *alloc_pdev(struct
>              }
>              break;
>  
> -        case DEV_TYPE_PCI:
>          case DEV_TYPE_PCIe_BRIDGE:
>          case DEV_TYPE_PCI_HOST_BRIDGE:
> +            pdev->ext_cfg = true;
>              break;
>  
>          default:
> @@ -420,6 +467,19 @@ static struct pci_dev *alloc_pdev(struct
>              break;
>      }
>  
> +    if ( pdev->ext_cfg &&
> +         /*
> +          * Regular PCI devices have 256 bytes, but PCI-X 2 and PCI Express
> +          * devices have 4096 bytes.  Even if the device is capable, that
> +          * doesn't mean we can access it.  Maybe we don't have a way to
> +          * generate extended config space accesses, or the device is behind a
> +          * reverse Express bridge.  So we try reading the dword at
> +          * PCI_CFG_SPACE_SIZE which must either be 0 or a valid extended
> +          * capability header.
> +          */
> +         pci_conf_read32(pdev->sbdf, PCI_CFG_SPACE_SIZE) == 0xffffffffU )
> +        pdev->ext_cfg = false;

I'm using a machine where
xen/arch/x86/x86_64/mmconfig-shared.c:is_mmconf_reserved() will initially return
false during Xen boot:

(XEN) PCI: MCFG configuration 0: base f0000000 segment 0000 buses 00 - 3f
(XEN) PCI: Not using MCFG for segment 0000 bus 00-3f

Then, during dom0 boot, dom0 will call PHYSDEVOP_pci_mmcfg_reserved, after which
MCFG becomes enabled in Xen:

(XEN) PCI: Using MCFG for segment 0000 bus 00-3f

On such machines where mmcfg/ECAM is initially disabled, this will effectively
set ->ext_cfg to false for all devices discovered at Xen boot.

I'm not really sure if I have any good suggestions, but perhaps we could add a
macro or small function that returns something like
   ( pdev->ext_cfg && pci_conf_read32(pdev->sbdf, PCI_CFG_SPACE_SIZE) != 0xffffffffU )
to allow this checking to happen dynamically (but this still wouldn't handle the
aliasing quirk). Maybe re-run the ext_cfg detection logic at the end of
PHYSDEVOP_pci_mmcfg_reserved?

Regardless, I'd be happy to provide my R-b without this addressed, but I am
curious if others think this as an issue?

> +
>      apply_quirks(pdev);
>      check_pdev(pdev);
>  
> @@ -717,6 +777,11 @@ int pci_add_device(u16 seg, u8 bus, u8 d
>  
>                  list_add(&pdev->vf_list, &pf_pdev->vf_list);
>              }
> +
> +            if ( !pdev->ext_cfg )
> +                printk(XENLOG_WARNING
> +                       "%pp: VF without extended config space?\n",
> +                       &pdev->sbdf);
>          }
>      }
>  
> --- a/xen/include/xen/pci.h
> +++ b/xen/include/xen/pci.h
> @@ -126,6 +126,9 @@ struct pci_dev {
>  
>      nodeid_t node; /* NUMA node */
>  
> +    /* Whether the device has extended config space. */

Nit: it would be nice to clearly state if this means the extended config is
accessible, or whether the device merely has it (but might not be accessible).

> +    bool ext_cfg;
> +
>      /* Device to be quarantined, don't automatically re-assign to dom0 */
>      bool quarantine;
>  
>

Re: [PATCH 1/6] PCI: determine whether a device has extended config space

Posted by Jan Beulich 3 weeks, 5 days ago

On 12.01.2026 22:07, Stewart Hildebrand wrote:
> On 1/6/26 08:47, Jan Beulich wrote:
>> ---
>> Note that alloc_pdev()'s switch doesn't handle DEV_TYPE_PCI2PCIe_BRIDGE at
>> all. Such bridges will therefore not have ->ext_cfg set (which is likely
>> wrong).
> 
> I initially read "set" as in "set to true", but I think you mean that ext_cfg
> isn't assigned at all.

Both are the same really, due to alloc_pdev() using xzalloc().

> Though perhaps it should actually be set to true, because ...
> 
>> Shouldn't we handle them like DEV_TYPE_LEGACY_PCI_BRIDGE (or
>> DEV_TYPE_PCI?) anyway (just like VT-d's set_msi_source_id() does)?
> 
> ... in pdev_type(), we will only reach DEV_TYPE_PCI2PCIe_BRIDGE if it has
> PCI_CAP_ID_EXP, which would indicate the device has extended config. So maybe it
> would be better to handle it similar to DEV_TYPE_PCIe2PCI_BRIDGE in
> alloc_pdev().

Hence my question. Since apparently you agree, I'll make that change. Maybe
in a separate, prereq patch.

>> @@ -420,6 +467,19 @@ static struct pci_dev *alloc_pdev(struct
>>              break;
>>      }
>>  
>> +    if ( pdev->ext_cfg &&
>> +         /*
>> +          * Regular PCI devices have 256 bytes, but PCI-X 2 and PCI Express
>> +          * devices have 4096 bytes.  Even if the device is capable, that
>> +          * doesn't mean we can access it.  Maybe we don't have a way to
>> +          * generate extended config space accesses, or the device is behind a
>> +          * reverse Express bridge.  So we try reading the dword at
>> +          * PCI_CFG_SPACE_SIZE which must either be 0 or a valid extended
>> +          * capability header.
>> +          */
>> +         pci_conf_read32(pdev->sbdf, PCI_CFG_SPACE_SIZE) == 0xffffffffU )
>> +        pdev->ext_cfg = false;
> 
> I'm using a machine where
> xen/arch/x86/x86_64/mmconfig-shared.c:is_mmconf_reserved() will initially return
> false during Xen boot:
> 
> (XEN) PCI: MCFG configuration 0: base f0000000 segment 0000 buses 00 - 3f
> (XEN) PCI: Not using MCFG for segment 0000 bus 00-3f
> 
> Then, during dom0 boot, dom0 will call PHYSDEVOP_pci_mmcfg_reserved, after which
> MCFG becomes enabled in Xen:
> 
> (XEN) PCI: Using MCFG for segment 0000 bus 00-3f
> 
> On such machines where mmcfg/ECAM is initially disabled, this will effectively
> set ->ext_cfg to false for all devices discovered at Xen boot.
> 
> I'm not really sure if I have any good suggestions, but perhaps we could add a
> macro or small function that returns something like
>    ( pdev->ext_cfg && pci_conf_read32(pdev->sbdf, PCI_CFG_SPACE_SIZE) != 0xffffffffU )
> to allow this checking to happen dynamically (but this still wouldn't handle the
> aliasing quirk). Maybe re-run the ext_cfg detection logic at the end of
> PHYSDEVOP_pci_mmcfg_reserved?
> 
> Regardless, I'd be happy to provide my R-b without this addressed, but I am
> curious if others think this as an issue?

Hmm, no, I forgot to consider that case (which in principle I'm well aware of).
Will need to fix in v2.

>> --- a/xen/include/xen/pci.h
>> +++ b/xen/include/xen/pci.h
>> @@ -126,6 +126,9 @@ struct pci_dev {
>>  
>>      nodeid_t node; /* NUMA node */
>>  
>> +    /* Whether the device has extended config space. */
> 
> Nit: it would be nice to clearly state if this means the extended config is
> accessible, or whether the device merely has it (but might not be accessible).

Well. Would the indicator be of any use if it wasn't accessible? Because if it
isn't accessible, we may not even be certain it exists. But yes, I can surely
make it "... has (accessible) extended ...".

Jan

Re: [PATCH 1/6] PCI: determine whether a device has extended config space

Posted by Jan Beulich 3 weeks, 3 days ago

On 13.01.2026 11:46, Jan Beulich wrote:
> On 12.01.2026 22:07, Stewart Hildebrand wrote:
>> On 1/6/26 08:47, Jan Beulich wrote:
>>> @@ -420,6 +467,19 @@ static struct pci_dev *alloc_pdev(struct
>>>              break;
>>>      }
>>>  
>>> +    if ( pdev->ext_cfg &&
>>> +         /*
>>> +          * Regular PCI devices have 256 bytes, but PCI-X 2 and PCI Express
>>> +          * devices have 4096 bytes.  Even if the device is capable, that
>>> +          * doesn't mean we can access it.  Maybe we don't have a way to
>>> +          * generate extended config space accesses, or the device is behind a
>>> +          * reverse Express bridge.  So we try reading the dword at
>>> +          * PCI_CFG_SPACE_SIZE which must either be 0 or a valid extended
>>> +          * capability header.
>>> +          */
>>> +         pci_conf_read32(pdev->sbdf, PCI_CFG_SPACE_SIZE) == 0xffffffffU )
>>> +        pdev->ext_cfg = false;
>>
>> I'm using a machine where
>> xen/arch/x86/x86_64/mmconfig-shared.c:is_mmconf_reserved() will initially return
>> false during Xen boot:
>>
>> (XEN) PCI: MCFG configuration 0: base f0000000 segment 0000 buses 00 - 3f
>> (XEN) PCI: Not using MCFG for segment 0000 bus 00-3f
>>
>> Then, during dom0 boot, dom0 will call PHYSDEVOP_pci_mmcfg_reserved, after which
>> MCFG becomes enabled in Xen:
>>
>> (XEN) PCI: Using MCFG for segment 0000 bus 00-3f
>>
>> On such machines where mmcfg/ECAM is initially disabled, this will effectively
>> set ->ext_cfg to false for all devices discovered at Xen boot.
>>
>> I'm not really sure if I have any good suggestions, but perhaps we could add a
>> macro or small function that returns something like
>>    ( pdev->ext_cfg && pci_conf_read32(pdev->sbdf, PCI_CFG_SPACE_SIZE) != 0xffffffffU )
>> to allow this checking to happen dynamically (but this still wouldn't handle the
>> aliasing quirk). Maybe re-run the ext_cfg detection logic at the end of
>> PHYSDEVOP_pci_mmcfg_reserved?
>>
>> Regardless, I'd be happy to provide my R-b without this addressed, but I am
>> curious if others think this as an issue?
> 
> Hmm, no, I forgot to consider that case (which in principle I'm well aware of).
> Will need to fix in v2.

My reply yesterday was actually not quite sufficient. On a system like yours,
isn't it the case that PVH Dom0 then also doesn't work quite right (yet), due
to parts of vPCI depending on extended config space accesses now? All of what
we presently do during boot, and which requires extended config space access,
would need re-doing once extended config space access becomes available (or
goes away) for a (sub)set of devices.

Jan

Re: [PATCH 1/6] PCI: determine whether a device has extended config space

Posted by Stewart Hildebrand 3 weeks, 3 days ago

On 1/15/26 05:43, Jan Beulich wrote:
> On 13.01.2026 11:46, Jan Beulich wrote:
>> On 12.01.2026 22:07, Stewart Hildebrand wrote:
>>> On 1/6/26 08:47, Jan Beulich wrote:
>>>> @@ -420,6 +467,19 @@ static struct pci_dev *alloc_pdev(struct
>>>>              break;
>>>>      }
>>>>  
>>>> +    if ( pdev->ext_cfg &&
>>>> +         /*
>>>> +          * Regular PCI devices have 256 bytes, but PCI-X 2 and PCI Express
>>>> +          * devices have 4096 bytes.  Even if the device is capable, that
>>>> +          * doesn't mean we can access it.  Maybe we don't have a way to
>>>> +          * generate extended config space accesses, or the device is behind a
>>>> +          * reverse Express bridge.  So we try reading the dword at
>>>> +          * PCI_CFG_SPACE_SIZE which must either be 0 or a valid extended
>>>> +          * capability header.
>>>> +          */
>>>> +         pci_conf_read32(pdev->sbdf, PCI_CFG_SPACE_SIZE) == 0xffffffffU )
>>>> +        pdev->ext_cfg = false;
>>>
>>> I'm using a machine where
>>> xen/arch/x86/x86_64/mmconfig-shared.c:is_mmconf_reserved() will initially return
>>> false during Xen boot:
>>>
>>> (XEN) PCI: MCFG configuration 0: base f0000000 segment 0000 buses 00 - 3f
>>> (XEN) PCI: Not using MCFG for segment 0000 bus 00-3f
>>>
>>> Then, during dom0 boot, dom0 will call PHYSDEVOP_pci_mmcfg_reserved, after which
>>> MCFG becomes enabled in Xen:
>>>
>>> (XEN) PCI: Using MCFG for segment 0000 bus 00-3f
>>>
>>> On such machines where mmcfg/ECAM is initially disabled, this will effectively
>>> set ->ext_cfg to false for all devices discovered at Xen boot.
>>>
>>> I'm not really sure if I have any good suggestions, but perhaps we could add a
>>> macro or small function that returns something like
>>>    ( pdev->ext_cfg && pci_conf_read32(pdev->sbdf, PCI_CFG_SPACE_SIZE) != 0xffffffffU )
>>> to allow this checking to happen dynamically (but this still wouldn't handle the
>>> aliasing quirk). Maybe re-run the ext_cfg detection logic at the end of
>>> PHYSDEVOP_pci_mmcfg_reserved?
>>>
>>> Regardless, I'd be happy to provide my R-b without this addressed, but I am
>>> curious if others think this as an issue?
>>
>> Hmm, no, I forgot to consider that case (which in principle I'm well aware of).
>> Will need to fix in v2.
> 
> My reply yesterday was actually not quite sufficient. On a system like yours,
> isn't it the case that PVH Dom0 then also doesn't work quite right (yet), due
> to parts of vPCI depending on extended config space accesses now? All of what
> we presently do during boot, and which requires extended config space access,
> would need re-doing once extended config space access becomes available (or
> goes away) for a (sub)set of devices.

Right, e.g. rebar will fail to initialize. One possible approach might be to do
some variation of this at the end of PHYSDEVOP_pci_mmcfg_reserved (untested):

    for_each_pdev ( hardware_domain, pdev )
        vpci_reset_device(pdev);

This would be better than nothing IMO, however, as you point out, it may only be
needed for a subset of devices.

Re: [PATCH 1/6] PCI: determine whether a device has extended config space

Posted by Jan Beulich 3 weeks, 4 days ago

On 13.01.2026 11:46, Jan Beulich wrote:
> On 12.01.2026 22:07, Stewart Hildebrand wrote:
>> On 1/6/26 08:47, Jan Beulich wrote:
>>> @@ -420,6 +467,19 @@ static struct pci_dev *alloc_pdev(struct
>>>              break;
>>>      }
>>>  
>>> +    if ( pdev->ext_cfg &&
>>> +         /*
>>> +          * Regular PCI devices have 256 bytes, but PCI-X 2 and PCI Express
>>> +          * devices have 4096 bytes.  Even if the device is capable, that
>>> +          * doesn't mean we can access it.  Maybe we don't have a way to
>>> +          * generate extended config space accesses, or the device is behind a
>>> +          * reverse Express bridge.  So we try reading the dword at
>>> +          * PCI_CFG_SPACE_SIZE which must either be 0 or a valid extended
>>> +          * capability header.
>>> +          */
>>> +         pci_conf_read32(pdev->sbdf, PCI_CFG_SPACE_SIZE) == 0xffffffffU )
>>> +        pdev->ext_cfg = false;
>>
>> I'm using a machine where
>> xen/arch/x86/x86_64/mmconfig-shared.c:is_mmconf_reserved() will initially return
>> false during Xen boot:
>>
>> (XEN) PCI: MCFG configuration 0: base f0000000 segment 0000 buses 00 - 3f
>> (XEN) PCI: Not using MCFG for segment 0000 bus 00-3f
>>
>> Then, during dom0 boot, dom0 will call PHYSDEVOP_pci_mmcfg_reserved, after which
>> MCFG becomes enabled in Xen:
>>
>> (XEN) PCI: Using MCFG for segment 0000 bus 00-3f
>>
>> On such machines where mmcfg/ECAM is initially disabled, this will effectively
>> set ->ext_cfg to false for all devices discovered at Xen boot.
>>
>> I'm not really sure if I have any good suggestions, but perhaps we could add a
>> macro or small function that returns something like
>>    ( pdev->ext_cfg && pci_conf_read32(pdev->sbdf, PCI_CFG_SPACE_SIZE) != 0xffffffffU )
>> to allow this checking to happen dynamically (but this still wouldn't handle the
>> aliasing quirk). Maybe re-run the ext_cfg detection logic at the end of
>> PHYSDEVOP_pci_mmcfg_reserved?
>>
>> Regardless, I'd be happy to provide my R-b without this addressed, but I am
>> curious if others think this as an issue?
> 
> Hmm, no, I forgot to consider that case (which in principle I'm well aware of).
> Will need to fix in v2.

That'll be a little interesting, since per Dom0's request we may also lose
MCFG access to a range of busses. Looks like we indeed need to fully re-
evaluate ->ext_cfg.

Jan

Re: [PATCH 1/6] PCI: determine whether a device has extended config space

Posted by Teddy Astie 1 month ago

Le 06/01/2026 à 14:50, Jan Beulich a écrit :
> Legacy PCI devices don't have any extended config space. Reading any part
> thereof may return all ones or other arbitrary data, e.g. in some cases
> base config space contents repeatedly.
> 
> Logic follows Linux 6.19-rc's pci_cfg_space_size(), albeit leveraging our
> determination of device type; in particular some comments are taken
> verbatim from there.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> Note that alloc_pdev()'s switch doesn't handle DEV_TYPE_PCI2PCIe_BRIDGE at
> all. Such bridges will therefore not have ->ext_cfg set (which is likely
> wrong). Shouldn't we handle them like DEV_TYPE_LEGACY_PCI_BRIDGE (or
> DEV_TYPE_PCI?) anyway (just like VT-d's set_msi_source_id() does)?
> 
> Linux also has CONFIG_PCI_QUIRKS, allowing to compile out the slightly
> risky code (as reads may in principle have side effects). Should we gain
> such, too?
> 
> --- a/xen/drivers/passthrough/pci.c
> +++ b/xen/drivers/passthrough/pci.c
> @@ -310,6 +310,41 @@ static void apply_quirks(struct pci_dev
>                * from trying to size the BARs or add handlers to trap accesses.
>                */
>               pdev->ignore_bars = true;
> +
> +    if ( pdev->ext_cfg )
> +    {
> +        unsigned int pos;
> +
> +        /*
> +         * PCI Express to PCI/PCI-X Bridge Specification, rev 1.0, 4.1.4 says
> +         * that when forwarding a type1 configuration request the bridge must
> +         * check that the extended register address field is zero.  The bridge
> +         * is not permitted to forward the transactions and must handle it as
> +         * an Unsupported Request.  Some bridges do not follow this rule and
> +         * simply drop the extended register bits, resulting in the standard
> +         * config space being aliased, every 256 bytes across the entire
> +         * configuration space.  Test for this condition by comparing the first
> +         * dword of each potential alias to the vendor/device ID.
> +         * Known offenders:
> +         *   ASM1083/1085 PCIe-to-PCI Reversible Bridge (1b21:1080, rev 01 & 03)
> +         *   AMD/ATI SBx00 PCI to PCI Bridge (1002:4384, rev 40)
> +         */
> +        for ( pos = PCI_CFG_SPACE_SIZE;
> +              pos < PCI_CFG_SPACE_EXP_SIZE; pos += PCI_CFG_SPACE_SIZE )
> +        {
> +            if ( pci_conf_read16(pdev->sbdf, pos) != vendor ||
> +                 pci_conf_read16(pdev->sbdf, pos + 2) != device )
> +                break;
> +        }
> +
> +        if ( pos >= PCI_CFG_SPACE_EXP_SIZE )
> +        {
> +            printk(XENLOG_WARNING
> +                   "%pp: extended config space aliases base one\n",
> +                   &pdev->sbdf);
> +            pdev->ext_cfg = false;
> +        }
> +    }

Given that it only appears to be the case for PCIe to PCI/PCI-X bridges, 
do we want to check this for all devices ?

>   }
>   
>   static struct pci_dev *alloc_pdev(struct pci_seg *pseg, u8 bus, u8 devfn)
> @@ -351,6 +386,8 @@ static struct pci_dev *alloc_pdev(struct
>           unsigned long flags;
>   
>           case DEV_TYPE_PCIe2PCI_BRIDGE:
> +            pdev->ext_cfg = true;
> +            fallthrough;
>           case DEV_TYPE_LEGACY_PCI_BRIDGE:
>               sec_bus = pci_conf_read8(pdev->sbdf, PCI_SECONDARY_BUS);
>               sub_bus = pci_conf_read8(pdev->sbdf, PCI_SUBORDINATE_BUS);
> @@ -363,9 +400,19 @@ static struct pci_dev *alloc_pdev(struct
>                   pseg->bus2bridge[sec_bus].devfn = devfn;
>               }
>               spin_unlock_irqrestore(&pseg->bus2bridge_lock, flags);
> +
> +            fallthrough;
> +        case DEV_TYPE_PCI:
> +            pos = pci_find_cap_offset(pdev->sbdf, PCI_CAP_ID_PCIX);
> +            if ( pos &&
> +                 (pci_conf_read32(pdev->sbdf, pos + PCI_X_STATUS) &
> +                  (PCI_X_STATUS_266MHZ | PCI_X_STATUS_533MHZ)) )
> +                pdev->ext_cfg = true;
>               break;
>   
>           case DEV_TYPE_PCIe_ENDPOINT:
> +            pdev->ext_cfg = true;
> +
>               pos = pci_find_cap_offset(pdev->sbdf, PCI_CAP_ID_EXP);
>               BUG_ON(!pos);
>               cap = pci_conf_read16(pdev->sbdf, pos + PCI_EXP_DEVCAP);
> @@ -409,9 +456,9 @@ static struct pci_dev *alloc_pdev(struct
>               }
>               break;
>   
> -        case DEV_TYPE_PCI:
>           case DEV_TYPE_PCIe_BRIDGE:
>           case DEV_TYPE_PCI_HOST_BRIDGE:
> +            pdev->ext_cfg = true;
>               break;
>   
>           default:
> @@ -420,6 +467,19 @@ static struct pci_dev *alloc_pdev(struct
>               break;
>       }
>   
> +    if ( pdev->ext_cfg &&
> +         /*
> +          * Regular PCI devices have 256 bytes, but PCI-X 2 and PCI Express
> +          * devices have 4096 bytes.  Even if the device is capable, that
> +          * doesn't mean we can access it.  Maybe we don't have a way to
> +          * generate extended config space accesses, or the device is behind a
> +          * reverse Express bridge.  So we try reading the dword at
> +          * PCI_CFG_SPACE_SIZE which must either be 0 or a valid extended
> +          * capability header.
> +          */
> +         pci_conf_read32(pdev->sbdf, PCI_CFG_SPACE_SIZE) == 0xffffffffU )
> +        pdev->ext_cfg = false;
> +
>       apply_quirks(pdev);
>       check_pdev(pdev);
>   
> @@ -717,6 +777,11 @@ int pci_add_device(u16 seg, u8 bus, u8 d
>   
>                   list_add(&pdev->vf_list, &pf_pdev->vf_list);
>               }
> +
> +            if ( !pdev->ext_cfg )
> +                printk(XENLOG_WARNING
> +                       "%pp: VF without extended config space?\n",
> +                       &pdev->sbdf);
>           }
>       }
>   
> --- a/xen/include/xen/pci.h
> +++ b/xen/include/xen/pci.h
> @@ -126,6 +126,9 @@ struct pci_dev {
>   
>       nodeid_t node; /* NUMA node */
>   
> +    /* Whether the device has extended config space. */
> +    bool ext_cfg;
> +
>       /* Device to be quarantined, don't automatically re-assign to dom0 */
>       bool quarantine;
>   
> 
> 



--
Teddy Astie | Vates XCP-ng Developer

XCP-ng & Xen Orchestra - Vates solutions

web: https://vates.tech

Re: [PATCH 1/6] PCI: determine whether a device has extended config space

Posted by Jan Beulich 1 month ago

On 07.01.2026 11:00, Teddy Astie wrote:
> Le 06/01/2026 à 14:50, Jan Beulich a écrit :
>> --- a/xen/drivers/passthrough/pci.c
>> +++ b/xen/drivers/passthrough/pci.c
>> @@ -310,6 +310,41 @@ static void apply_quirks(struct pci_dev
>>                * from trying to size the BARs or add handlers to trap accesses.
>>                */
>>               pdev->ignore_bars = true;
>> +
>> +    if ( pdev->ext_cfg )
>> +    {
>> +        unsigned int pos;
>> +
>> +        /*
>> +         * PCI Express to PCI/PCI-X Bridge Specification, rev 1.0, 4.1.4 says
>> +         * that when forwarding a type1 configuration request the bridge must
>> +         * check that the extended register address field is zero.  The bridge
>> +         * is not permitted to forward the transactions and must handle it as
>> +         * an Unsupported Request.  Some bridges do not follow this rule and
>> +         * simply drop the extended register bits, resulting in the standard
>> +         * config space being aliased, every 256 bytes across the entire
>> +         * configuration space.  Test for this condition by comparing the first
>> +         * dword of each potential alias to the vendor/device ID.
>> +         * Known offenders:
>> +         *   ASM1083/1085 PCIe-to-PCI Reversible Bridge (1b21:1080, rev 01 & 03)
>> +         *   AMD/ATI SBx00 PCI to PCI Bridge (1002:4384, rev 40)
>> +         */
>> +        for ( pos = PCI_CFG_SPACE_SIZE;
>> +              pos < PCI_CFG_SPACE_EXP_SIZE; pos += PCI_CFG_SPACE_SIZE )
>> +        {
>> +            if ( pci_conf_read16(pdev->sbdf, pos) != vendor ||
>> +                 pci_conf_read16(pdev->sbdf, pos + 2) != device )
>> +                break;
>> +        }
>> +
>> +        if ( pos >= PCI_CFG_SPACE_EXP_SIZE )
>> +        {
>> +            printk(XENLOG_WARNING
>> +                   "%pp: extended config space aliases base one\n",
>> +                   &pdev->sbdf);
>> +            pdev->ext_cfg = false;
>> +        }
>> +    }
> 
> Given that it only appears to be the case for PCIe to PCI/PCI-X bridges, 
> do we want to check this for all devices ?

Yes. Bridges are only the main examples, but in the few systems I tested this
on so far I have at least one ordinary device which also aliases base config
space. Also note that Linux, where the base logic is taken from, also doesn't
restrict this checking (if I'm reading the code right).

Jan

[PATCH 1/6] PCI: determine whether a device has extended config space
[PATCH 2/6] PCI: pass pdev to pci_ats_{device,enabled}()
[PATCH 3/6] x86/MSI: pass pdev to read_pci_mem_bar()
[PATCH 4/6] PCI: pass pdev to pci_find_{,next_}ext_capability()
[PATCH 5/6] PCI: don't look for ext-caps when there's no extended cfg space
[PATCH 6/6] vPCI/DomU: really no ext-caps without extended config space