When vpci fails to initialize a legacy capability of device, it just
returns an error and vPCI gets disabled for the whole device. That
most likely renders the device unusable, plus possibly causing issues
to Xen itself if guest attempts to program the native MSI or MSI-X
capabilities if present.
So, add new function to hide legacy capability when initialization
fails. And remove the failed legacy capability from the vpci emulated
legacy capability list.
Signed-off-by: Jiqian Chen <Jiqian.Chen@amd.com>
---
cc: "Roger Pau Monné" <roger.pau@citrix.com>
---
v6->v7 changes:
* Change the pointer parameter of vpci_get_register(),
vpci_get_previous_cap_register() and vpci_capability_hide() to be const.
v5->v6 changes:
* Rename parameter rm to r in vpci_get_register().
* Use for loop to compact the code of vpci_get_previous_cap_register().
* Rename prev_next_r to prev_r in vpci_capability_hide(().
* Add printing when cap init, cleanup and hide fail.
v4->v5 changes:
* Modify vpci_get_register() to delete some unnecessary check, so that
I don't need to move function vpci_register_cmp().
* Rename vpci_capability_mask() to vpci_capability_hide().
v3->v4 changes:
* Modify the commit message.
* In function vpci_get_previous_cap_register(), add an ASSERT_UNREACHABLE() if offset below 0x40.
* Modify vpci_capability_mask() to return error instead of using ASSERT.
* Use vpci_remove_register to remove PCI_CAP_LIST_ID register instead of open code.
* Add check "if ( !offset )" in vpci_capability_mask().
v2->v3 changes:
* Separated from the last version patch "vpci: Hide capability when it fails to initialize"
* Whole implementation changed because last version is wrong.
This version adds a new helper function vpci_get_register() and uses it to get
target handler and previous handler from vpci->handlers, then remove the target.
v1->v2 changes:
* Removed the "priorities" of initializing capabilities since it isn't used anymore.
* Added new function vpci_capability_mask() and vpci_ext_capability_mask() to
remove failed capability from list.
* Called vpci_make_msix_hole() in the end of init_msix().
Best regards,
Jiqian Chen.
---
xen/drivers/vpci/vpci.c | 109 +++++++++++++++++++++++++++++++++++++++-
1 file changed, 108 insertions(+), 1 deletion(-)
diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
index e7e5b64f1be4..a91c3d4a1415 100644
--- a/xen/drivers/vpci/vpci.c
+++ b/xen/drivers/vpci/vpci.c
@@ -83,6 +83,88 @@ static int assign_virtual_sbdf(struct pci_dev *pdev)
#endif /* CONFIG_HAS_VPCI_GUEST_SUPPORT */
+static struct vpci_register *vpci_get_register(const struct vpci *vpci,
+ unsigned int offset,
+ unsigned int size)
+{
+ struct vpci_register *r;
+
+ ASSERT(spin_is_locked(&vpci->lock));
+
+ list_for_each_entry ( r, &vpci->handlers, node )
+ {
+ if ( r->offset == offset && r->size == size )
+ return r;
+
+ if ( offset <= r->offset )
+ break;
+ }
+
+ return NULL;
+}
+
+static struct vpci_register *vpci_get_previous_cap_register(
+ const struct vpci *vpci, unsigned int offset)
+{
+ uint32_t next;
+ struct vpci_register *r;
+
+ if ( offset < 0x40 )
+ {
+ ASSERT_UNREACHABLE();
+ return NULL;
+ }
+
+ for ( r = vpci_get_register(vpci, PCI_CAPABILITY_LIST, 1); r;
+ r = next >= 0x40 ? vpci_get_register(vpci,
+ next + PCI_CAP_LIST_NEXT, 1)
+ : NULL )
+ {
+ next = (uint32_t)(uintptr_t)r->private;
+ ASSERT(next == (uintptr_t)r->private);
+ if ( next == offset )
+ break;
+ }
+
+ return r;
+}
+
+static int vpci_capability_hide(const struct pci_dev *pdev, unsigned int cap)
+{
+ const unsigned int offset = pci_find_cap_offset(pdev->sbdf, cap);
+ struct vpci_register *prev_r, *next_r;
+ struct vpci *vpci = pdev->vpci;
+
+ if ( !offset )
+ {
+ ASSERT_UNREACHABLE();
+ return 0;
+ }
+
+ spin_lock(&vpci->lock);
+ prev_r = vpci_get_previous_cap_register(vpci, offset);
+ next_r = vpci_get_register(vpci, offset + PCI_CAP_LIST_NEXT, 1);
+ if ( !prev_r || !next_r )
+ {
+ spin_unlock(&vpci->lock);
+ return -ENODEV;
+ }
+
+ prev_r->private = next_r->private;
+ /*
+ * Not calling vpci_remove_register() here is to avoid redoing
+ * the register search
+ */
+ list_del(&next_r->node);
+ spin_unlock(&vpci->lock);
+ xfree(next_r);
+
+ if ( !is_hardware_domain(pdev->domain) )
+ return vpci_remove_register(vpci, offset + PCI_CAP_LIST_ID, 1);
+
+ return 0;
+}
+
static int vpci_init_capabilities(struct pci_dev *pdev)
{
for ( unsigned int i = 0; i < NUM_VPCI_INIT; i++ )
@@ -103,7 +185,32 @@ static int vpci_init_capabilities(struct pci_dev *pdev)
rc = capability->init(pdev);
if ( rc )
- return rc;
+ {
+ const char *type = is_ext ? "extended" : "legacy";
+
+ printk(XENLOG_WARNING "%pd %pp: init %s cap %u fail rc=%d, mask it\n",
+ pdev->domain, &pdev->sbdf, type, cap, rc);
+
+ if ( capability->cleanup )
+ {
+ rc = capability->cleanup(pdev);
+ if ( rc )
+ {
+ printk(XENLOG_ERR "%pd %pp: clean %s cap %u fail rc=%d\n",
+ pdev->domain, &pdev->sbdf, type, cap, rc);
+ return rc;
+ }
+ }
+
+ if ( !is_ext )
+ rc = vpci_capability_hide(pdev, cap);
+ if ( rc )
+ {
+ printk(XENLOG_ERR "%pd %pp: hide %s cap %u fail rc=%d\n",
+ pdev->domain, &pdev->sbdf, type, cap, rc);
+ return rc;
+ }
+ }
}
return 0;
--
2.34.1
On Fri, Jul 04, 2025 at 03:07:58PM +0800, Jiqian Chen wrote:
> When vpci fails to initialize a legacy capability of device, it just
> returns an error and vPCI gets disabled for the whole device. That
> most likely renders the device unusable, plus possibly causing issues
> to Xen itself if guest attempts to program the native MSI or MSI-X
> capabilities if present.
>
> So, add new function to hide legacy capability when initialization
> fails. And remove the failed legacy capability from the vpci emulated
> legacy capability list.
>
> Signed-off-by: Jiqian Chen <Jiqian.Chen@amd.com>
> ---
> cc: "Roger Pau Monné" <roger.pau@citrix.com>
> ---
> v6->v7 changes:
> * Change the pointer parameter of vpci_get_register(),
> vpci_get_previous_cap_register() and vpci_capability_hide() to be const.
>
> v5->v6 changes:
> * Rename parameter rm to r in vpci_get_register().
> * Use for loop to compact the code of vpci_get_previous_cap_register().
> * Rename prev_next_r to prev_r in vpci_capability_hide(().
> * Add printing when cap init, cleanup and hide fail.
>
> v4->v5 changes:
> * Modify vpci_get_register() to delete some unnecessary check, so that
> I don't need to move function vpci_register_cmp().
> * Rename vpci_capability_mask() to vpci_capability_hide().
>
> v3->v4 changes:
> * Modify the commit message.
> * In function vpci_get_previous_cap_register(), add an ASSERT_UNREACHABLE() if offset below 0x40.
> * Modify vpci_capability_mask() to return error instead of using ASSERT.
> * Use vpci_remove_register to remove PCI_CAP_LIST_ID register instead of open code.
> * Add check "if ( !offset )" in vpci_capability_mask().
>
> v2->v3 changes:
> * Separated from the last version patch "vpci: Hide capability when it fails to initialize"
> * Whole implementation changed because last version is wrong.
> This version adds a new helper function vpci_get_register() and uses it to get
> target handler and previous handler from vpci->handlers, then remove the target.
>
> v1->v2 changes:
> * Removed the "priorities" of initializing capabilities since it isn't used anymore.
> * Added new function vpci_capability_mask() and vpci_ext_capability_mask() to
> remove failed capability from list.
> * Called vpci_make_msix_hole() in the end of init_msix().
>
> Best regards,
> Jiqian Chen.
> ---
> xen/drivers/vpci/vpci.c | 109 +++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 108 insertions(+), 1 deletion(-)
>
> diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
> index e7e5b64f1be4..a91c3d4a1415 100644
> --- a/xen/drivers/vpci/vpci.c
> +++ b/xen/drivers/vpci/vpci.c
> @@ -83,6 +83,88 @@ static int assign_virtual_sbdf(struct pci_dev *pdev)
>
> #endif /* CONFIG_HAS_VPCI_GUEST_SUPPORT */
>
> +static struct vpci_register *vpci_get_register(const struct vpci *vpci,
> + unsigned int offset,
> + unsigned int size)
> +{
> + struct vpci_register *r;
> +
> + ASSERT(spin_is_locked(&vpci->lock));
> +
> + list_for_each_entry ( r, &vpci->handlers, node )
> + {
> + if ( r->offset == offset && r->size == size )
> + return r;
> +
> + if ( offset <= r->offset )
> + break;
> + }
> +
> + return NULL;
> +}
> +
> +static struct vpci_register *vpci_get_previous_cap_register(
> + const struct vpci *vpci, unsigned int offset)
> +{
> + uint32_t next;
> + struct vpci_register *r;
> +
> + if ( offset < 0x40 )
> + {
> + ASSERT_UNREACHABLE();
> + return NULL;
> + }
> +
> + for ( r = vpci_get_register(vpci, PCI_CAPABILITY_LIST, 1); r;
> + r = next >= 0x40 ? vpci_get_register(vpci,
> + next + PCI_CAP_LIST_NEXT, 1)
> + : NULL )
> + {
> + next = (uint32_t)(uintptr_t)r->private;
Both 'next' type and the explicit truncation done here would better
use "unsigned int" to match the type of the input offset parameter?
> + ASSERT(next == (uintptr_t)r->private);
> + if ( next == offset )
> + break;
> + }
> +
> + return r;
> +}
> +
> +static int vpci_capability_hide(const struct pci_dev *pdev, unsigned int cap)
> +{
> + const unsigned int offset = pci_find_cap_offset(pdev->sbdf, cap);
> + struct vpci_register *prev_r, *next_r;
> + struct vpci *vpci = pdev->vpci;
> +
> + if ( !offset )
> + {
> + ASSERT_UNREACHABLE();
> + return 0;
> + }
> +
> + spin_lock(&vpci->lock);
> + prev_r = vpci_get_previous_cap_register(vpci, offset);
> + next_r = vpci_get_register(vpci, offset + PCI_CAP_LIST_NEXT, 1);
> + if ( !prev_r || !next_r )
> + {
> + spin_unlock(&vpci->lock);
> + return -ENODEV;
> + }
> +
> + prev_r->private = next_r->private;
> + /*
> + * Not calling vpci_remove_register() here is to avoid redoing
> + * the register search
> + */
> + list_del(&next_r->node);
> + spin_unlock(&vpci->lock);
> + xfree(next_r);
> +
> + if ( !is_hardware_domain(pdev->domain) )
> + return vpci_remove_register(vpci, offset + PCI_CAP_LIST_ID, 1);
> +
> + return 0;
> +}
> +
> static int vpci_init_capabilities(struct pci_dev *pdev)
> {
> for ( unsigned int i = 0; i < NUM_VPCI_INIT; i++ )
> @@ -103,7 +185,32 @@ static int vpci_init_capabilities(struct pci_dev *pdev)
>
> rc = capability->init(pdev);
> if ( rc )
> - return rc;
> + {
> + const char *type = is_ext ? "extended" : "legacy";
> +
> + printk(XENLOG_WARNING "%pd %pp: init %s cap %u fail rc=%d, mask it\n",
Nit: in order to avoid the strictly speaking overly long line here you
could split it like:
printk(XENLOG_WARNING
"%pd %pp: init %s cap %u fail rc=%d, mask it\n",
pdev->domain, ...
> + pdev->domain, &pdev->sbdf, type, cap, rc);
> +
> + if ( capability->cleanup )
> + {
> + rc = capability->cleanup(pdev);
> + if ( rc )
> + {
> + printk(XENLOG_ERR "%pd %pp: clean %s cap %u fail rc=%d\n",
> + pdev->domain, &pdev->sbdf, type, cap, rc);
I think it would be fine to not return error here for the hardware
domain, and try to mask the capability even if cleanup() has failed?
For the hardware domain it's likely better to not fail and attempt to
mask, rather than fail and then end up exposing the device without any
kind of vPCI mediation. For domU the proposed behavior is fine.
Thanks, Roger.
On 2025/7/21 23:48, Roger Pau Monné wrote:
> On Fri, Jul 04, 2025 at 03:07:58PM +0800, Jiqian Chen wrote:
>> When vpci fails to initialize a legacy capability of device, it just
>> returns an error and vPCI gets disabled for the whole device. That
>> most likely renders the device unusable, plus possibly causing issues
>> to Xen itself if guest attempts to program the native MSI or MSI-X
>> capabilities if present.
>>
>> So, add new function to hide legacy capability when initialization
>> fails. And remove the failed legacy capability from the vpci emulated
>> legacy capability list.
>>
>> Signed-off-by: Jiqian Chen <Jiqian.Chen@amd.com>
>> ---
>> cc: "Roger Pau Monné" <roger.pau@citrix.com>
>> ---
>> v6->v7 changes:
>> * Change the pointer parameter of vpci_get_register(),
>> vpci_get_previous_cap_register() and vpci_capability_hide() to be const.
>>
>> v5->v6 changes:
>> * Rename parameter rm to r in vpci_get_register().
>> * Use for loop to compact the code of vpci_get_previous_cap_register().
>> * Rename prev_next_r to prev_r in vpci_capability_hide(().
>> * Add printing when cap init, cleanup and hide fail.
>>
>> v4->v5 changes:
>> * Modify vpci_get_register() to delete some unnecessary check, so that
>> I don't need to move function vpci_register_cmp().
>> * Rename vpci_capability_mask() to vpci_capability_hide().
>>
>> v3->v4 changes:
>> * Modify the commit message.
>> * In function vpci_get_previous_cap_register(), add an ASSERT_UNREACHABLE() if offset below 0x40.
>> * Modify vpci_capability_mask() to return error instead of using ASSERT.
>> * Use vpci_remove_register to remove PCI_CAP_LIST_ID register instead of open code.
>> * Add check "if ( !offset )" in vpci_capability_mask().
>>
>> v2->v3 changes:
>> * Separated from the last version patch "vpci: Hide capability when it fails to initialize"
>> * Whole implementation changed because last version is wrong.
>> This version adds a new helper function vpci_get_register() and uses it to get
>> target handler and previous handler from vpci->handlers, then remove the target.
>>
>> v1->v2 changes:
>> * Removed the "priorities" of initializing capabilities since it isn't used anymore.
>> * Added new function vpci_capability_mask() and vpci_ext_capability_mask() to
>> remove failed capability from list.
>> * Called vpci_make_msix_hole() in the end of init_msix().
>>
>> Best regards,
>> Jiqian Chen.
>> ---
>> xen/drivers/vpci/vpci.c | 109 +++++++++++++++++++++++++++++++++++++++-
>> 1 file changed, 108 insertions(+), 1 deletion(-)
>>
>> diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
>> index e7e5b64f1be4..a91c3d4a1415 100644
>> --- a/xen/drivers/vpci/vpci.c
>> +++ b/xen/drivers/vpci/vpci.c
>> @@ -83,6 +83,88 @@ static int assign_virtual_sbdf(struct pci_dev *pdev)
>>
>> #endif /* CONFIG_HAS_VPCI_GUEST_SUPPORT */
>>
>> +static struct vpci_register *vpci_get_register(const struct vpci *vpci,
>> + unsigned int offset,
>> + unsigned int size)
>> +{
>> + struct vpci_register *r;
>> +
>> + ASSERT(spin_is_locked(&vpci->lock));
>> +
>> + list_for_each_entry ( r, &vpci->handlers, node )
>> + {
>> + if ( r->offset == offset && r->size == size )
>> + return r;
>> +
>> + if ( offset <= r->offset )
>> + break;
>> + }
>> +
>> + return NULL;
>> +}
>> +
>> +static struct vpci_register *vpci_get_previous_cap_register(
>> + const struct vpci *vpci, unsigned int offset)
>> +{
>> + uint32_t next;
>> + struct vpci_register *r;
>> +
>> + if ( offset < 0x40 )
>> + {
>> + ASSERT_UNREACHABLE();
>> + return NULL;
>> + }
>> +
>> + for ( r = vpci_get_register(vpci, PCI_CAPABILITY_LIST, 1); r;
>> + r = next >= 0x40 ? vpci_get_register(vpci,
>> + next + PCI_CAP_LIST_NEXT, 1)
>> + : NULL )
>> + {
>> + next = (uint32_t)(uintptr_t)r->private;
>
> Both 'next' type and the explicit truncation done here would better
> use "unsigned int" to match the type of the input offset parameter?
Make sense, will change.
>
>> + ASSERT(next == (uintptr_t)r->private);
>> + if ( next == offset )
>> + break;
>> + }
>> +
>> + return r;
>> +}
>> +
>> +static int vpci_capability_hide(const struct pci_dev *pdev, unsigned int cap)
>> +{
>> + const unsigned int offset = pci_find_cap_offset(pdev->sbdf, cap);
>> + struct vpci_register *prev_r, *next_r;
>> + struct vpci *vpci = pdev->vpci;
>> +
>> + if ( !offset )
>> + {
>> + ASSERT_UNREACHABLE();
>> + return 0;
>> + }
>> +
>> + spin_lock(&vpci->lock);
>> + prev_r = vpci_get_previous_cap_register(vpci, offset);
>> + next_r = vpci_get_register(vpci, offset + PCI_CAP_LIST_NEXT, 1);
>> + if ( !prev_r || !next_r )
>> + {
>> + spin_unlock(&vpci->lock);
>> + return -ENODEV;
>> + }
>> +
>> + prev_r->private = next_r->private;
>> + /*
>> + * Not calling vpci_remove_register() here is to avoid redoing
>> + * the register search
>> + */
>> + list_del(&next_r->node);
>> + spin_unlock(&vpci->lock);
>> + xfree(next_r);
>> +
>> + if ( !is_hardware_domain(pdev->domain) )
>> + return vpci_remove_register(vpci, offset + PCI_CAP_LIST_ID, 1);
>> +
>> + return 0;
>> +}
>> +
>> static int vpci_init_capabilities(struct pci_dev *pdev)
>> {
>> for ( unsigned int i = 0; i < NUM_VPCI_INIT; i++ )
>> @@ -103,7 +185,32 @@ static int vpci_init_capabilities(struct pci_dev *pdev)
>>
>> rc = capability->init(pdev);
>> if ( rc )
>> - return rc;
>> + {
>> + const char *type = is_ext ? "extended" : "legacy";
>> +
>> + printk(XENLOG_WARNING "%pd %pp: init %s cap %u fail rc=%d, mask it\n",
>
> Nit: in order to avoid the strictly speaking overly long line here you
> could split it like:
>
> printk(XENLOG_WARNING
> "%pd %pp: init %s cap %u fail rc=%d, mask it\n",
> pdev->domain, ...
Will do.
>
>> + pdev->domain, &pdev->sbdf, type, cap, rc);
>> +
>> + if ( capability->cleanup )
>> + {
>> + rc = capability->cleanup(pdev);
>> + if ( rc )
>> + {
>> + printk(XENLOG_ERR "%pd %pp: clean %s cap %u fail rc=%d\n",
>> + pdev->domain, &pdev->sbdf, type, cap, rc);
>
> I think it would be fine to not return error here for the hardware
> domain, and try to mask the capability even if cleanup() has failed?
>
> For the hardware domain it's likely better to not fail and attempt to
> mask, rather than fail and then end up exposing the device without any
> kind of vPCI mediation. For domU the proposed behavior is fine.
Got it.
So here should be?
if ( rc && !is_hardware_domain(pdev->domain) )
{
printk(XENLOG_ERR "%pd %pp: clean %s cap %u fail rc=%d\n",
pdev->domain, &pdev->sbdf, type, cap, rc);
return rc;
}
or
if ( rc )
{
printk(XENLOG_ERR "%pd %pp: clean %s cap %u fail rc=%d\n",
pdev->domain, &pdev->sbdf, type, cap, rc);
if ( !is_hardware_domain(pdev->domain) )
return rc;
}
>
> Thanks, Roger.
--
Best regards,
Jiqian Chen.
On Wed, Jul 23, 2025 at 07:33:04AM +0000, Chen, Jiqian wrote:
> On 2025/7/21 23:48, Roger Pau Monné wrote:
> > On Fri, Jul 04, 2025 at 03:07:58PM +0800, Jiqian Chen wrote:
> >> + pdev->domain, &pdev->sbdf, type, cap, rc);
> >> +
> >> + if ( capability->cleanup )
> >> + {
> >> + rc = capability->cleanup(pdev);
> >> + if ( rc )
> >> + {
> >> + printk(XENLOG_ERR "%pd %pp: clean %s cap %u fail rc=%d\n",
> >> + pdev->domain, &pdev->sbdf, type, cap, rc);
> >
> > I think it would be fine to not return error here for the hardware
> > domain, and try to mask the capability even if cleanup() has failed?
> >
> > For the hardware domain it's likely better to not fail and attempt to
> > mask, rather than fail and then end up exposing the device without any
> > kind of vPCI mediation. For domU the proposed behavior is fine.
> Got it.
> So here should be?
> if ( rc && !is_hardware_domain(pdev->domain) )
> {
> printk(XENLOG_ERR "%pd %pp: clean %s cap %u fail rc=%d\n",
> pdev->domain, &pdev->sbdf, type, cap, rc);
> return rc;
> }
> or
> if ( rc )
> {
> printk(XENLOG_ERR "%pd %pp: clean %s cap %u fail rc=%d\n",
> pdev->domain, &pdev->sbdf, type, cap, rc);
> if ( !is_hardware_domain(pdev->domain) )
> return rc;
> }
The last one will be my preference, we want to log the error, just
not forward it to the caller.
Thanks, Roger.
© 2016 - 2025 Red Hat, Inc.