[PATCH v4 5/5] vPCI: re-init extended-capabilities when MMCFG availability changed

Jan Beulich posted 5 patches 3 months, 3 weeks ago
[PATCH v4 5/5] vPCI: re-init extended-capabilities when MMCFG availability changed
Posted by Jan Beulich 3 months, 3 weeks ago
When Dom0 informs us about MMCFG usability, this may change whether
extended capabilities are available (accessible) for devices. Zap what
might be on record, and re-initialize things.

No synchronization is added for the case where devices may already be in
use. That'll need sorting when (a) DomU support was added and (b) DomU-s
may run already while Dom0 / hwdom still boots (dom0less, Hyperlaunch).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
vpci_reinit_ext_capabilities()'es return value isn't checked, as it
doesn't feel quite right to fail the hypercall because of this. At the
same time it also doesn't feel quite right to have the function return
"void". Thoughts?
---
v4: Make sure ->cleanup() and ->init() are invoked.
v3: New.

--- a/xen/arch/x86/physdev.c
+++ b/xen/arch/x86/physdev.c
@@ -8,6 +8,8 @@
 #include <xen/guest_access.h>
 #include <xen/iocap.h>
 #include <xen/serial.h>
+#include <xen/vpci.h>
+
 #include <asm/current.h>
 #include <asm/io_apic.h>
 #include <asm/msi.h>
@@ -169,7 +171,10 @@ int cf_check physdev_check_pci_extcfg(st
 
     ASSERT(pdev->seg == info->segment);
     if ( pdev->bus >= info->start_bus && pdev->bus <= info->end_bus )
+    {
         pci_check_extcfg(pdev);
+        vpci_reinit_ext_capabilities(pdev);
+    }
 
     return 0;
 }
--- a/xen/drivers/vpci/cap.c
+++ b/xen/drivers/vpci/cap.c
@@ -285,13 +285,16 @@ static int vpci_init_ext_capability_list
     return 0;
 }
 
-int vpci_init_capabilities(struct pci_dev *pdev)
+int vpci_init_capabilities(struct pci_dev *pdev, bool ext_only)
 {
     int rc;
 
-    rc = vpci_init_capability_list(pdev);
-    if ( rc )
-        return rc;
+    if ( !ext_only )
+    {
+        rc = vpci_init_capability_list(pdev);
+        if ( rc )
+            return rc;
+    }
 
     rc = vpci_init_ext_capability_list(pdev);
     if ( rc )
@@ -305,7 +308,7 @@ int vpci_init_capabilities(struct pci_de
         unsigned int pos = 0;
 
         if ( !is_ext )
-            pos = pci_find_cap_offset(pdev->sbdf, cap);
+            pos = !ext_only ? pci_find_cap_offset(pdev->sbdf, cap) : 0;
         else if ( is_hardware_domain(pdev->domain) )
             pos = pci_find_ext_capability(pdev, cap);
 
@@ -349,7 +352,7 @@ int vpci_init_capabilities(struct pci_de
     return 0;
 }
 
-void vpci_cleanup_capabilities(struct pci_dev *pdev)
+void vpci_cleanup_capabilities(struct pci_dev *pdev, bool ext_only)
 {
     for ( unsigned int i = 0; i < NUM_VPCI_INIT; i++ )
     {
@@ -361,7 +364,7 @@ void vpci_cleanup_capabilities(struct pc
             continue;
 
         if ( !capability->is_ext )
-            pos = pci_find_cap_offset(pdev->sbdf, cap);
+            pos = !ext_only ? pci_find_cap_offset(pdev->sbdf, cap) : 0;
         else if ( is_hardware_domain(pdev->domain) )
             pos = pci_find_ext_capability(pdev, cap);
         if ( pos )
@@ -376,6 +379,20 @@ void vpci_cleanup_capabilities(struct pc
     }
 }
 
+int vpci_reinit_ext_capabilities(struct pci_dev *pdev)
+{
+    if ( !pdev->vpci )
+        return 0;
+
+    vpci_cleanup_capabilities(pdev, true);
+
+    if ( vpci_remove_registers(pdev->vpci, PCI_CFG_SPACE_SIZE,
+                               PCI_CFG_SPACE_EXP_SIZE - PCI_CFG_SPACE_SIZE) )
+        ASSERT_UNREACHABLE();
+
+    return vpci_init_capabilities(pdev, true);
+}
+
 /*
  * Local variables:
  * mode: C
--- a/xen/drivers/vpci/private.h
+++ b/xen/drivers/vpci/private.h
@@ -44,8 +44,8 @@ typedef struct {
 #define REGISTER_VPCI_EXTCAP(name, finit, fclean) \
     REGISTER_VPCI_CAPABILITY(PCI_EXT_CAP_ID_##name, name, finit, fclean, true)
 
-int vpci_init_capabilities(struct pci_dev *pdev);
-void vpci_cleanup_capabilities(struct pci_dev *pdev);
+int vpci_init_capabilities(struct pci_dev *pdev, bool ext_only);
+void vpci_cleanup_capabilities(struct pci_dev *pdev, bool ext_only);
 
 /* Add/remove a register handler. */
 int __must_check vpci_add_register_mask(struct vpci *vpci,
--- a/xen/drivers/vpci/vpci.c
+++ b/xen/drivers/vpci/vpci.c
@@ -102,7 +102,7 @@ void vpci_deassign_device(struct pci_dev
                     &pdev->domain->vpci_dev_assigned_map);
 #endif
 
-    vpci_cleanup_capabilities(pdev);
+    vpci_cleanup_capabilities(pdev, false);
 
     spin_lock(&pdev->vpci->lock);
     while ( !list_empty(&pdev->vpci->handlers) )
@@ -159,7 +159,7 @@ int vpci_assign_device(struct pci_dev *p
     if ( rc )
         goto out;
 
-    rc = vpci_init_capabilities(pdev);
+    rc = vpci_init_capabilities(pdev, false);
 
  out:
     if ( rc )
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -20,6 +20,7 @@
 #define VPCI_MAX_VIRT_DEV       (PCI_SLOT(~0) + 1)
 
 int __must_check vpci_init_header(struct pci_dev *pdev);
+int vpci_reinit_ext_capabilities(struct pci_dev *pdev);
 
 /* Assign vPCI to device by adding handlers. */
 int __must_check vpci_assign_device(struct pci_dev *pdev);
@@ -197,6 +198,11 @@ bool vpci_ecam_read(pci_sbdf_t sbdf, uns
 #else /* !CONFIG_HAS_VPCI */
 struct vpci_vcpu {};
 
+static inline int vpci_reinit_ext_capabilities(struct pci_dev *pdev)
+{
+    return 0;
+}
+
 static inline int vpci_assign_device(struct pci_dev *pdev)
 {
     return 0;
Re: [PATCH v4 5/5] vPCI: re-init extended-capabilities when MMCFG availability changed
Posted by Roger Pau Monné 3 months, 2 weeks ago
On Tue, Feb 10, 2026 at 11:55:34AM +0100, Jan Beulich wrote:
> When Dom0 informs us about MMCFG usability, this may change whether
> extended capabilities are available (accessible) for devices. Zap what
> might be on record, and re-initialize things.
> 
> No synchronization is added for the case where devices may already be in
> use. That'll need sorting when (a) DomU support was added and (b) DomU-s
> may run already while Dom0 / hwdom still boots (dom0less, Hyperlaunch).
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> vpci_reinit_ext_capabilities()'es return value isn't checked, as it
> doesn't feel quite right to fail the hypercall because of this. At the
> same time it also doesn't feel quite right to have the function return
> "void". Thoughts?

For the non hardwware domain case we could deassign the device from
the domain?

And print a warning message for both cases.

> ---
> v4: Make sure ->cleanup() and ->init() are invoked.
> v3: New.
> 
> --- a/xen/arch/x86/physdev.c
> +++ b/xen/arch/x86/physdev.c
> @@ -8,6 +8,8 @@
>  #include <xen/guest_access.h>
>  #include <xen/iocap.h>
>  #include <xen/serial.h>
> +#include <xen/vpci.h>
> +
>  #include <asm/current.h>
>  #include <asm/io_apic.h>
>  #include <asm/msi.h>
> @@ -169,7 +171,10 @@ int cf_check physdev_check_pci_extcfg(st
>  
>      ASSERT(pdev->seg == info->segment);
>      if ( pdev->bus >= info->start_bus && pdev->bus <= info->end_bus )
> +    {
>          pci_check_extcfg(pdev);
> +        vpci_reinit_ext_capabilities(pdev);
> +    }
>  
>      return 0;
>  }
> --- a/xen/drivers/vpci/cap.c
> +++ b/xen/drivers/vpci/cap.c
> @@ -285,13 +285,16 @@ static int vpci_init_ext_capability_list
>      return 0;
>  }
>  
> -int vpci_init_capabilities(struct pci_dev *pdev)
> +int vpci_init_capabilities(struct pci_dev *pdev, bool ext_only)
>  {
>      int rc;
>  
> -    rc = vpci_init_capability_list(pdev);
> -    if ( rc )
> -        return rc;
> +    if ( !ext_only )
> +    {
> +        rc = vpci_init_capability_list(pdev);
> +        if ( rc )
> +            return rc;
> +    }
>  
>      rc = vpci_init_ext_capability_list(pdev);
>      if ( rc )
> @@ -305,7 +308,7 @@ int vpci_init_capabilities(struct pci_de
>          unsigned int pos = 0;
>  
>          if ( !is_ext )
> -            pos = pci_find_cap_offset(pdev->sbdf, cap);
> +            pos = !ext_only ? pci_find_cap_offset(pdev->sbdf, cap) : 0;
>          else if ( is_hardware_domain(pdev->domain) )
>              pos = pci_find_ext_capability(pdev, cap);
>  
> @@ -349,7 +352,7 @@ int vpci_init_capabilities(struct pci_de
>      return 0;
>  }
>  
> -void vpci_cleanup_capabilities(struct pci_dev *pdev)
> +void vpci_cleanup_capabilities(struct pci_dev *pdev, bool ext_only)
>  {
>      for ( unsigned int i = 0; i < NUM_VPCI_INIT; i++ )
>      {
> @@ -361,7 +364,7 @@ void vpci_cleanup_capabilities(struct pc
>              continue;
>  
>          if ( !capability->is_ext )
> -            pos = pci_find_cap_offset(pdev->sbdf, cap);
> +            pos = !ext_only ? pci_find_cap_offset(pdev->sbdf, cap) : 0;
>          else if ( is_hardware_domain(pdev->domain) )
>              pos = pci_find_ext_capability(pdev, cap);
>          if ( pos )
> @@ -376,6 +379,20 @@ void vpci_cleanup_capabilities(struct pc
>      }
>  }
>  
> +int vpci_reinit_ext_capabilities(struct pci_dev *pdev)
> +{
> +    if ( !pdev->vpci )
> +        return 0;
> +
> +    vpci_cleanup_capabilities(pdev, true);
> +
> +    if ( vpci_remove_registers(pdev->vpci, PCI_CFG_SPACE_SIZE,
> +                               PCI_CFG_SPACE_EXP_SIZE - PCI_CFG_SPACE_SIZE) )
> +        ASSERT_UNREACHABLE();
> +
> +    return vpci_init_capabilities(pdev, true);

I wonder here, in the context here, where the device is already
assigned to a domain you likely need to take the vPCI lock to safely
perform (parts of?) the cleanup and reinit.  Otherwise you could free
capability data while it's being accessed by the handlers.

The only current extended capability (reBAR) doesn't have any internal
state to free on cleanup, so it's all safe.  But a cleanup like the
MSI(-X) ones would be racy, as they free the structure without holding
the vPCI lock.  I think we need to be careful, and possibly adjust the
cleanup functions so they can tolerate cleanup with possible
concurrent accesses.

Thanks, Roger.
Re: [PATCH v4 5/5] vPCI: re-init extended-capabilities when MMCFG availability changed
Posted by Jan Beulich 3 months, 2 weeks ago
On 20.02.2026 10:28, Roger Pau Monné wrote:
> On Tue, Feb 10, 2026 at 11:55:34AM +0100, Jan Beulich wrote:
>> When Dom0 informs us about MMCFG usability, this may change whether
>> extended capabilities are available (accessible) for devices. Zap what
>> might be on record, and re-initialize things.
>>
>> No synchronization is added for the case where devices may already be in
>> use. That'll need sorting when (a) DomU support was added and (b) DomU-s
>> may run already while Dom0 / hwdom still boots (dom0less, Hyperlaunch).
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> ---
>> vpci_reinit_ext_capabilities()'es return value isn't checked, as it
>> doesn't feel quite right to fail the hypercall because of this. At the
>> same time it also doesn't feel quite right to have the function return
>> "void". Thoughts?
> 
> For the non hardwware domain case we could deassign the device from
> the domain?

Will need to check. De-assigning is generally done only from domctl context,
I think. I'm also uncertain what other things may break (in Xen or the
toolstacks) if we take away a device in such a pretty much uncontrolled way.

> And print a warning message for both cases.

Can do, albeit I'm unsure what "both" refers to - I see only ...

>> --- a/xen/arch/x86/physdev.c
>> +++ b/xen/arch/x86/physdev.c
>> @@ -8,6 +8,8 @@
>>  #include <xen/guest_access.h>
>>  #include <xen/iocap.h>
>>  #include <xen/serial.h>
>> +#include <xen/vpci.h>
>> +
>>  #include <asm/current.h>
>>  #include <asm/io_apic.h>
>>  #include <asm/msi.h>
>> @@ -169,7 +171,10 @@ int cf_check physdev_check_pci_extcfg(st
>>  
>>      ASSERT(pdev->seg == info->segment);
>>      if ( pdev->bus >= info->start_bus && pdev->bus <= info->end_bus )
>> +    {
>>          pci_check_extcfg(pdev);
>> +        vpci_reinit_ext_capabilities(pdev);
>> +    }

... this.

>> @@ -376,6 +379,20 @@ void vpci_cleanup_capabilities(struct pc
>>      }
>>  }
>>  
>> +int vpci_reinit_ext_capabilities(struct pci_dev *pdev)
>> +{
>> +    if ( !pdev->vpci )
>> +        return 0;
>> +
>> +    vpci_cleanup_capabilities(pdev, true);
>> +
>> +    if ( vpci_remove_registers(pdev->vpci, PCI_CFG_SPACE_SIZE,
>> +                               PCI_CFG_SPACE_EXP_SIZE - PCI_CFG_SPACE_SIZE) )
>> +        ASSERT_UNREACHABLE();
>> +
>> +    return vpci_init_capabilities(pdev, true);
> 
> I wonder here, in the context here, where the device is already
> assigned to a domain you likely need to take the vPCI lock to safely
> perform (parts of?) the cleanup and reinit.  Otherwise you could free
> capability data while it's being accessed by the handlers.

The lock isn't recursive, so I fear we'd deadlock if it was taken here.
Furthermore this falls into "DomU support needs dealing with"; right
now we assume Dom0 tells us about its final MCFG verdict ahead of
putting devices in use. Once we need to consider devices already in
use, I think we would further need to pause the owning domain. Also ...

> The only current extended capability (reBAR) doesn't have any internal
> state to free on cleanup, so it's all safe.  But a cleanup like the
> MSI(-X) ones would be racy, as they free the structure without holding
> the vPCI lock.  I think we need to be careful, and possibly adjust the
> cleanup functions so they can tolerate cleanup with possible
> concurrent accesses.

... to cover such. (For something like MSI(-X) it might then further be
necessary to mask/disable interrupts, but hopefully we'll never have to
deal with extended capabilities that would require this.)

Jan

Re: [PATCH v4 5/5] vPCI: re-init extended-capabilities when MMCFG availability changed
Posted by Roger Pau Monné 3 months, 2 weeks ago
On Fri, Feb 20, 2026 at 11:20:52AM +0100, Jan Beulich wrote:
> On 20.02.2026 10:28, Roger Pau Monné wrote:
> > On Tue, Feb 10, 2026 at 11:55:34AM +0100, Jan Beulich wrote:
> >> When Dom0 informs us about MMCFG usability, this may change whether
> >> extended capabilities are available (accessible) for devices. Zap what
> >> might be on record, and re-initialize things.
> >>
> >> No synchronization is added for the case where devices may already be in
> >> use. That'll need sorting when (a) DomU support was added and (b) DomU-s
> >> may run already while Dom0 / hwdom still boots (dom0less, Hyperlaunch).
> >>
> >> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> >> ---
> >> vpci_reinit_ext_capabilities()'es return value isn't checked, as it
> >> doesn't feel quite right to fail the hypercall because of this. At the
> >> same time it also doesn't feel quite right to have the function return
> >> "void". Thoughts?
> > 
> > For the non hardwware domain case we could deassign the device from
> > the domain?
> 
> Will need to check. De-assigning is generally done only from domctl context,
> I think. I'm also uncertain what other things may break (in Xen or the
> toolstacks) if we take away a device in such a pretty much uncontrolled way.
> 
> > And print a warning message for both cases.
> 
> Can do, albeit I'm unsure what "both" refers to - I see only ...

Oh, I mean print a warning message for both the hardware domain and
the domU case.

> >> --- a/xen/arch/x86/physdev.c
> >> +++ b/xen/arch/x86/physdev.c
> >> @@ -8,6 +8,8 @@
> >>  #include <xen/guest_access.h>
> >>  #include <xen/iocap.h>
> >>  #include <xen/serial.h>
> >> +#include <xen/vpci.h>
> >> +
> >>  #include <asm/current.h>
> >>  #include <asm/io_apic.h>
> >>  #include <asm/msi.h>
> >> @@ -169,7 +171,10 @@ int cf_check physdev_check_pci_extcfg(st
> >>  
> >>      ASSERT(pdev->seg == info->segment);
> >>      if ( pdev->bus >= info->start_bus && pdev->bus <= info->end_bus )
> >> +    {
> >>          pci_check_extcfg(pdev);
> >> +        vpci_reinit_ext_capabilities(pdev);
> >> +    }
> 
> ... this.
> 
> >> @@ -376,6 +379,20 @@ void vpci_cleanup_capabilities(struct pc
> >>      }
> >>  }
> >>  
> >> +int vpci_reinit_ext_capabilities(struct pci_dev *pdev)
> >> +{
> >> +    if ( !pdev->vpci )
> >> +        return 0;
> >> +
> >> +    vpci_cleanup_capabilities(pdev, true);
> >> +
> >> +    if ( vpci_remove_registers(pdev->vpci, PCI_CFG_SPACE_SIZE,
> >> +                               PCI_CFG_SPACE_EXP_SIZE - PCI_CFG_SPACE_SIZE) )
> >> +        ASSERT_UNREACHABLE();
> >> +
> >> +    return vpci_init_capabilities(pdev, true);
> > 
> > I wonder here, in the context here, where the device is already
> > assigned to a domain you likely need to take the vPCI lock to safely
> > perform (parts of?) the cleanup and reinit.  Otherwise you could free
> > capability data while it's being accessed by the handlers.
> 
> The lock isn't recursive, so I fear we'd deadlock if it was taken here.

Yeah, that's why I've put the "parts of" remark above, some of those
functions already take the lock themselves.

> Furthermore this falls into "DomU support needs dealing with"; right
> now we assume Dom0 tells us about its final MCFG verdict ahead of
> putting devices in use. Once we need to consider devices already in
> use, I think we would further need to pause the owning domain. Also ...

Agreed.  So that this is clear, you might add an ASSERT() that the
device is not assigned to a non-hardware domain when this reinit
happens?

> > The only current extended capability (reBAR) doesn't have any internal
> > state to free on cleanup, so it's all safe.  But a cleanup like the
> > MSI(-X) ones would be racy, as they free the structure without holding
> > the vPCI lock.  I think we need to be careful, and possibly adjust the
> > cleanup functions so they can tolerate cleanup with possible
> > concurrent accesses.
> 
> ... to cover such. (For something like MSI(-X) it might then further be
> necessary to mask/disable interrupts, but hopefully we'll never have to
> deal with extended capabilities that would require this.)

Indeed.  Something like MSI-X would be extra fun to deal with, but
sooner or later we will need to handle it if we ever plan to support
hot-unplug from domUs.

In fact, the current cleanup_msi{,x}() helpers should already remove any
interrupt bindings and free any mapped pIRQs, otherwise we rely on
the PVH hardware domain having cleaned up the device properly before it
being assigned to another domain?  (Not that you need to do it here,
just realized from your comment)

Thanks, Roger

Re: [PATCH v4 5/5] vPCI: re-init extended-capabilities when MMCFG availability changed
Posted by Stewart Hildebrand 3 months, 2 weeks ago
On 2/10/26 05:55, Jan Beulich wrote:
> --- a/xen/drivers/vpci/cap.c
> +++ b/xen/drivers/vpci/cap.c
> @@ -376,6 +379,20 @@ void vpci_cleanup_capabilities(struct pc
>      }
>  }
>  
> +int vpci_reinit_ext_capabilities(struct pci_dev *pdev)
> +{
> +    if ( !pdev->vpci )
> +        return 0;
> +
> +    vpci_cleanup_capabilities(pdev, true);
In the case where pdev->ext_cfg transitions from true to false, it doesn't look
like this would actually result in the respective capability->cleanup() hook
being called, due to reliance on pci_find_ext_capability().
Re: [PATCH v4 5/5] vPCI: re-init extended-capabilities when MMCFG availability changed
Posted by Jan Beulich 3 months, 2 weeks ago
On 19.02.2026 23:21, Stewart Hildebrand wrote:
> On 2/10/26 05:55, Jan Beulich wrote:
>> --- a/xen/drivers/vpci/cap.c
>> +++ b/xen/drivers/vpci/cap.c
>> @@ -376,6 +379,20 @@ void vpci_cleanup_capabilities(struct pc
>>      }
>>  }
>>  
>> +int vpci_reinit_ext_capabilities(struct pci_dev *pdev)
>> +{
>> +    if ( !pdev->vpci )
>> +        return 0;
>> +
>> +    vpci_cleanup_capabilities(pdev, true);
> In the case where pdev->ext_cfg transitions from true to false, it doesn't look
> like this would actually result in the respective capability->cleanup() hook
> being called, due to reliance on pci_find_ext_capability().

Hmm, indeed. Yet that's a problem with vpci_cleanup_capabilities(), not
with the call here. It may have been merely latent until no later than
b1543cf5751b ("PCI: don't look for ext-caps when there's no extended cfg
space"). The cleanup hooks themselves (it's only one right now) then
also may not access their respective capabilities anymore (nor even just
try to locate them).

Jan
Re: [PATCH v4 5/5] vPCI: re-init extended-capabilities when MMCFG availability changed
Posted by Roger Pau Monné 3 months, 2 weeks ago
On Fri, Feb 20, 2026 at 08:13:58AM +0100, Jan Beulich wrote:
> On 19.02.2026 23:21, Stewart Hildebrand wrote:
> > On 2/10/26 05:55, Jan Beulich wrote:
> >> --- a/xen/drivers/vpci/cap.c
> >> +++ b/xen/drivers/vpci/cap.c
> >> @@ -376,6 +379,20 @@ void vpci_cleanup_capabilities(struct pc
> >>      }
> >>  }
> >>  
> >> +int vpci_reinit_ext_capabilities(struct pci_dev *pdev)
> >> +{
> >> +    if ( !pdev->vpci )
> >> +        return 0;
> >> +
> >> +    vpci_cleanup_capabilities(pdev, true);
> > In the case where pdev->ext_cfg transitions from true to false, it doesn't look
> > like this would actually result in the respective capability->cleanup() hook
> > being called, due to reliance on pci_find_ext_capability().
> 
> Hmm, indeed. Yet that's a problem with vpci_cleanup_capabilities(), not
> with the call here. It may have been merely latent until no later than
> b1543cf5751b ("PCI: don't look for ext-caps when there's no extended cfg
> space"). The cleanup hooks themselves (it's only one right now) then
> also may not access their respective capabilities anymore (nor even just
> try to locate them).

Cleanup hooks should be idempotent, so in principle there should be no
need to check whether the capability is present before attempting to
clean it up.  However cleanup_rebar() does check for the position of
the capability, and the MMCFG having disappeared would prevent
cleanup there.  At least that capability needs to be adjusted to cache
the position in the config space and the number of BARs, so that the
cleanup hook doesn't rely on PCI config space accesses to fetch any of
this.

Thanks, Roger.