When vpci fails to initialize a legacy capability of device, it just
return error instead of catching and processing exception. That makes
the entire device unusable.
So, add new a function to hide legacy capability when initialization
fails. And remove the failed legacy capability from the vpci emulated
legacy capability list.
Signed-off-by: Jiqian Chen <Jiqian.Chen@amd.com>
---
cc: "Roger Pau Monné" <roger.pau@citrix.com>
---
v2->v3 changes:
* Separated from the last version patch "vpci: Hide capability when it fails to initialize"
* Whole implementation changed because last version is wrong.
  This version adds a new helper function vpci_get_register() and uses it to get
  target handler and previous handler from vpci->handlers, then remove the target.
v1->v2 changes:
* Removed the "priorities" of initializing capabilities since it isn't used anymore.
* Added new function vpci_capability_mask() and vpci_ext_capability_mask() to
  remove failed capability from list.
* Called vpci_make_msix_hole() in the end of init_msix().
Best regards,
Jiqian Chen.
---
 xen/drivers/vpci/vpci.c | 133 +++++++++++++++++++++++++++++++++-------
 1 file changed, 112 insertions(+), 21 deletions(-)
diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
index 5474b66668c1..f97c7cc460a0 100644
--- a/xen/drivers/vpci/vpci.c
+++ b/xen/drivers/vpci/vpci.c
@@ -35,6 +35,22 @@ struct vpci_register {
     uint32_t rsvdz_mask;
 };
 
+static int vpci_register_cmp(const struct vpci_register *r1,
+                             const struct vpci_register *r2)
+{
+    /* Return 0 if registers overlap. */
+    if ( r1->offset < r2->offset + r2->size &&
+         r2->offset < r1->offset + r1->size )
+        return 0;
+    if ( r1->offset < r2->offset )
+        return -1;
+    if ( r1->offset > r2->offset )
+        return 1;
+
+    ASSERT_UNREACHABLE();
+    return 0;
+}
+
 #ifdef __XEN__
 extern vpci_capability_t *const __start_vpci_array[];
 extern vpci_capability_t *const __end_vpci_array[];
@@ -83,7 +99,91 @@ static int assign_virtual_sbdf(struct pci_dev *pdev)
 
 #endif /* CONFIG_HAS_VPCI_GUEST_SUPPORT */
 
-static int vpci_init_capabilities(struct pci_dev *pdev)
+static struct vpci_register *vpci_get_register(struct vpci *vpci,
+                                               const unsigned int offset,
+                                               const unsigned int size)
+{
+    const struct vpci_register r = { .offset = offset, .size = size };
+    struct vpci_register *rm;
+
+    ASSERT(spin_is_locked(&vpci->lock));
+    list_for_each_entry ( rm, &vpci->handlers, node )
+    {
+        int cmp = vpci_register_cmp(&r, rm);
+
+        if ( !cmp && rm->offset == offset && rm->size == size )
+            return rm;
+        if ( cmp <= 0 )
+            break;
+    }
+
+    return NULL;
+}
+
+static struct vpci_register *vpci_get_previous_cap_register
+                (struct vpci *vpci, const unsigned int offset)
+{
+    uint32_t next;
+    struct vpci_register *r;
+
+    if ( offset < 0x40 )
+        return NULL;
+
+    r = vpci_get_register(vpci, PCI_CAPABILITY_LIST, 1);
+    ASSERT(r);
+
+    next = (uint32_t)(uintptr_t)r->private;
+    while ( next >= 0x40 && next != offset )
+    {
+        r = vpci_get_register(vpci, next + PCI_CAP_LIST_NEXT, 1);
+        ASSERT(r);
+        next = (uint32_t)(uintptr_t)r->private;
+    }
+
+    if ( next < 0x40 )
+        return NULL;
+
+    return r;
+}
+
+static void vpci_capability_mask(struct pci_dev *pdev,
+                                 const unsigned int cap)
+{
+    const unsigned int offset = pci_find_cap_offset(pdev->sbdf, cap);
+    struct vpci_register *prev_next_r, *next_r;
+    struct vpci *vpci = pdev->vpci;
+
+    spin_lock(&vpci->lock);
+    next_r = vpci_get_register(vpci, offset + PCI_CAP_LIST_NEXT, 1);
+    if ( !next_r )
+    {
+        spin_unlock(&vpci->lock);
+        return;
+    }
+
+    prev_next_r = vpci_get_previous_cap_register(vpci, offset);
+    ASSERT(prev_next_r);
+
+    prev_next_r->private = next_r->private;
+
+    if ( !is_hardware_domain(pdev->domain) )
+    {
+        struct vpci_register *id_r =
+            vpci_get_register(vpci, offset + PCI_CAP_LIST_ID, 1);
+
+        ASSERT(id_r);
+        /* PCI_CAP_LIST_ID register of target capability */
+        list_del(&id_r->node);
+        xfree(id_r);
+    }
+
+    /* PCI_CAP_LIST_NEXT register of target capability */
+    list_del(&next_r->node);
+    spin_unlock(&vpci->lock);
+    xfree(next_r);
+}
+
+static void vpci_init_capabilities(struct pci_dev *pdev)
 {
     for ( unsigned int i = 0; i < NUM_VPCI_INIT; i++ )
     {
@@ -107,10 +207,17 @@ static int vpci_init_capabilities(struct pci_dev *pdev)
         rc = capability->init(pdev);
 
         if ( rc )
-            return rc;
+        {
+            if ( capability->fini )
+                capability->fini(pdev);
+
+            printk(XENLOG_WARNING "%pd %pp: %s cap %u init fail rc=%d, mask it\n",
+                   pdev->domain, &pdev->sbdf,
+                   is_ext ? "extended" : "legacy", cap, rc);
+            if ( !is_ext )
+                vpci_capability_mask(pdev, cap);
+        }
     }
-
-    return 0;
 }
 
 void vpci_deassign_device(struct pci_dev *pdev)
@@ -192,7 +299,7 @@ int vpci_assign_device(struct pci_dev *pdev)
     if ( rc )
         goto out;
 
-    rc = vpci_init_capabilities(pdev);
+    vpci_init_capabilities(pdev);
 
  out:
     if ( rc )
@@ -202,22 +309,6 @@ int vpci_assign_device(struct pci_dev *pdev)
 }
 #endif /* __XEN__ */
 
-static int vpci_register_cmp(const struct vpci_register *r1,
-                             const struct vpci_register *r2)
-{
-    /* Return 0 if registers overlap. */
-    if ( r1->offset < r2->offset + r2->size &&
-         r2->offset < r1->offset + r1->size )
-        return 0;
-    if ( r1->offset < r2->offset )
-        return -1;
-    if ( r1->offset > r2->offset )
-        return 1;
-
-    ASSERT_UNREACHABLE();
-    return 0;
-}
-
 /* Dummy hooks, writes are ignored, reads return 1's */
 static uint32_t cf_check vpci_ignored_read(
     const struct pci_dev *pdev, unsigned int reg, void *data)
-- 
2.34.1
On Mon, Apr 21, 2025 at 02:18:58PM +0800, Jiqian Chen wrote:
> When vpci fails to initialize a legacy capability of device, it just
> return error instead of catching and processing exception. That makes
> the entire device unusable.
I think "catching and processing exception" is a weird terminology to
use when writing C.  It's IMo more accurate to use:
"When vpci fails to initialize a legacy capability of device, it just
returns an error and vPCI gets disabled for the whole device.  That
most likely renders the device unusable, plus possibly causing issues
to Xen itself if guest attempts to program the native MSI or MSI-X
capabilities if present."
> So, add new a function to hide legacy capability when initialization
> fails. And remove the failed legacy capability from the vpci emulated
> legacy capability list.
> 
> Signed-off-by: Jiqian Chen <Jiqian.Chen@amd.com>
> ---
> cc: "Roger Pau Monné" <roger.pau@citrix.com>
> ---
> v2->v3 changes:
> * Separated from the last version patch "vpci: Hide capability when it fails to initialize"
> * Whole implementation changed because last version is wrong.
>   This version adds a new helper function vpci_get_register() and uses it to get
>   target handler and previous handler from vpci->handlers, then remove the target.
> 
> v1->v2 changes:
> * Removed the "priorities" of initializing capabilities since it isn't used anymore.
> * Added new function vpci_capability_mask() and vpci_ext_capability_mask() to
>   remove failed capability from list.
> * Called vpci_make_msix_hole() in the end of init_msix().
> 
> Best regards,
> Jiqian Chen.
> ---
>  xen/drivers/vpci/vpci.c | 133 +++++++++++++++++++++++++++++++++-------
>  1 file changed, 112 insertions(+), 21 deletions(-)
> 
> diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
> index 5474b66668c1..f97c7cc460a0 100644
> --- a/xen/drivers/vpci/vpci.c
> +++ b/xen/drivers/vpci/vpci.c
> @@ -35,6 +35,22 @@ struct vpci_register {
>      uint32_t rsvdz_mask;
>  };
>  
> +static int vpci_register_cmp(const struct vpci_register *r1,
> +                             const struct vpci_register *r2)
> +{
> +    /* Return 0 if registers overlap. */
> +    if ( r1->offset < r2->offset + r2->size &&
> +         r2->offset < r1->offset + r1->size )
> +        return 0;
> +    if ( r1->offset < r2->offset )
> +        return -1;
> +    if ( r1->offset > r2->offset )
> +        return 1;
> +
> +    ASSERT_UNREACHABLE();
> +    return 0;
> +}
> +
>  #ifdef __XEN__
>  extern vpci_capability_t *const __start_vpci_array[];
>  extern vpci_capability_t *const __end_vpci_array[];
> @@ -83,7 +99,91 @@ static int assign_virtual_sbdf(struct pci_dev *pdev)
>  
>  #endif /* CONFIG_HAS_VPCI_GUEST_SUPPORT */
>  
> -static int vpci_init_capabilities(struct pci_dev *pdev)
> +static struct vpci_register *vpci_get_register(struct vpci *vpci,
> +                                               const unsigned int offset,
> +                                               const unsigned int size)
We don't usually use const attributes for scalar function parameters.
> +{
> +    const struct vpci_register r = { .offset = offset, .size = size };
> +    struct vpci_register *rm;
> +
> +    ASSERT(spin_is_locked(&vpci->lock));
> +    list_for_each_entry ( rm, &vpci->handlers, node )
> +    {
> +        int cmp = vpci_register_cmp(&r, rm);
> +
> +        if ( !cmp && rm->offset == offset && rm->size == size )
> +            return rm;
> +        if ( cmp <= 0 )
> +            break;
> +    }
> +
> +    return NULL;
> +}
> +
> +static struct vpci_register *vpci_get_previous_cap_register
> +                (struct vpci *vpci, const unsigned int offset)
The style preference here would be:
static struct vpci_register *vpci_get_previous_cap_register(
    struct vpci *vpci, unsigned int offset)
{
...
> +{
> +    uint32_t next;
> +    struct vpci_register *r;
> +
> +    if ( offset < 0x40 )
I would possibly add an ASSERT_UNREACHABLE() here, as attempting to
pass an offset below 0x40 is a sign of a bug elsewhere?
> +        return NULL;
> +
> +    r = vpci_get_register(vpci, PCI_CAPABILITY_LIST, 1);
> +    ASSERT(r);
> +
> +    next = (uint32_t)(uintptr_t)r->private;
> +    while ( next >= 0x40 && next != offset )
> +    {
> +        r = vpci_get_register(vpci, next + PCI_CAP_LIST_NEXT, 1);
> +        ASSERT(r);
> +        next = (uint32_t)(uintptr_t)r->private;
> +    }
> +
> +    if ( next < 0x40 )
> +        return NULL;
> +
> +    return r;
> +}
> +
> +static void vpci_capability_mask(struct pci_dev *pdev,
This possibly needs to return an error code, as it can fail, and just
adding ASSERTs all around seems a bit clumsy, plus we might really
want to prevent assigning the device to the domain if
vpci_capability_mask() fails for whatever reason.
> +                                 const unsigned int cap)
> +{
> +    const unsigned int offset = pci_find_cap_offset(pdev->sbdf, cap);
> +    struct vpci_register *prev_next_r, *next_r;
> +    struct vpci *vpci = pdev->vpci;
> +
> +    spin_lock(&vpci->lock);
> +    next_r = vpci_get_register(vpci, offset + PCI_CAP_LIST_NEXT, 1);
> +    if ( !next_r )
> +    {
> +        spin_unlock(&vpci->lock);
> +        return;
> +    }
> +
> +    prev_next_r = vpci_get_previous_cap_register(vpci, offset);
> +    ASSERT(prev_next_r);
> +
> +    prev_next_r->private = next_r->private;
> +
> +    if ( !is_hardware_domain(pdev->domain) )
> +    {
> +        struct vpci_register *id_r =
> +            vpci_get_register(vpci, offset + PCI_CAP_LIST_ID, 1);
> +
> +        ASSERT(id_r);
> +        /* PCI_CAP_LIST_ID register of target capability */
> +        list_del(&id_r->node);
> +        xfree(id_r);
You could use vpci_remove_register() here?
> +    }
> +
> +    /* PCI_CAP_LIST_NEXT register of target capability */
> +    list_del(&next_r->node);
> +    spin_unlock(&vpci->lock);
> +    xfree(next_r);
Here vpci_remove_register() could also be used, but it will involve
searching again for the register to remove, which is a bit pointless.
> +}
> +
> +static void vpci_init_capabilities(struct pci_dev *pdev)
>  {
>      for ( unsigned int i = 0; i < NUM_VPCI_INIT; i++ )
>      {
> @@ -107,10 +207,17 @@ static int vpci_init_capabilities(struct pci_dev *pdev)
>          rc = capability->init(pdev);
>  
>          if ( rc )
> -            return rc;
> +        {
> +            if ( capability->fini )
> +                capability->fini(pdev);
> +
> +            printk(XENLOG_WARNING "%pd %pp: %s cap %u init fail rc=%d, mask it\n",
Best to split to next line:
              printk(XENLOG_WARNING
	             "%pd %pp: %s cap %u init fail rc=%d, mask it\n",
Thanks, Roger.
                
            On 2025/5/7 00:00, Roger Pau Monné wrote:
> On Mon, Apr 21, 2025 at 02:18:58PM +0800, Jiqian Chen wrote:
>> When vpci fails to initialize a legacy capability of device, it just
>> return error instead of catching and processing exception. That makes
>> the entire device unusable.
> 
> I think "catching and processing exception" is a weird terminology to
> use when writing C.  It's IMo more accurate to use:
> 
> "When vpci fails to initialize a legacy capability of device, it just
> returns an error and vPCI gets disabled for the whole device.  That
> most likely renders the device unusable, plus possibly causing issues
> to Xen itself if guest attempts to program the native MSI or MSI-X
> capabilities if present."
Thanks, will change.
> 
>> So, add new a function to hide legacy capability when initialization
>> fails. And remove the failed legacy capability from the vpci emulated
>> legacy capability list.
>>
>> Signed-off-by: Jiqian Chen <Jiqian.Chen@amd.com>
>> ---
>> cc: "Roger Pau Monné" <roger.pau@citrix.com>
>> ---
>> v2->v3 changes:
>> * Separated from the last version patch "vpci: Hide capability when it fails to initialize"
>> * Whole implementation changed because last version is wrong.
>>   This version adds a new helper function vpci_get_register() and uses it to get
>>   target handler and previous handler from vpci->handlers, then remove the target.
>>
>> v1->v2 changes:
>> * Removed the "priorities" of initializing capabilities since it isn't used anymore.
>> * Added new function vpci_capability_mask() and vpci_ext_capability_mask() to
>>   remove failed capability from list.
>> * Called vpci_make_msix_hole() in the end of init_msix().
>>
>> Best regards,
>> Jiqian Chen.
>> ---
>>  xen/drivers/vpci/vpci.c | 133 +++++++++++++++++++++++++++++++++-------
>>  1 file changed, 112 insertions(+), 21 deletions(-)
>>
>> diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
>> index 5474b66668c1..f97c7cc460a0 100644
>> --- a/xen/drivers/vpci/vpci.c
>> +++ b/xen/drivers/vpci/vpci.c
>> @@ -35,6 +35,22 @@ struct vpci_register {
>>      uint32_t rsvdz_mask;
>>  };
>>  
>> +static int vpci_register_cmp(const struct vpci_register *r1,
>> +                             const struct vpci_register *r2)
>> +{
>> +    /* Return 0 if registers overlap. */
>> +    if ( r1->offset < r2->offset + r2->size &&
>> +         r2->offset < r1->offset + r1->size )
>> +        return 0;
>> +    if ( r1->offset < r2->offset )
>> +        return -1;
>> +    if ( r1->offset > r2->offset )
>> +        return 1;
>> +
>> +    ASSERT_UNREACHABLE();
>> +    return 0;
>> +}
>> +
>>  #ifdef __XEN__
>>  extern vpci_capability_t *const __start_vpci_array[];
>>  extern vpci_capability_t *const __end_vpci_array[];
>> @@ -83,7 +99,91 @@ static int assign_virtual_sbdf(struct pci_dev *pdev)
>>  
>>  #endif /* CONFIG_HAS_VPCI_GUEST_SUPPORT */
>>  
>> -static int vpci_init_capabilities(struct pci_dev *pdev)
>> +static struct vpci_register *vpci_get_register(struct vpci *vpci,
>> +                                               const unsigned int offset,
>> +                                               const unsigned int size)
> 
> We don't usually use const attributes for scalar function parameters.
> 
>> +{
>> +    const struct vpci_register r = { .offset = offset, .size = size };
>> +    struct vpci_register *rm;
>> +
>> +    ASSERT(spin_is_locked(&vpci->lock));
>> +    list_for_each_entry ( rm, &vpci->handlers, node )
>> +    {
>> +        int cmp = vpci_register_cmp(&r, rm);
>> +
>> +        if ( !cmp && rm->offset == offset && rm->size == size )
>> +            return rm;
>> +        if ( cmp <= 0 )
>> +            break;
>> +    }
>> +
>> +    return NULL;
>> +}
>> +
>> +static struct vpci_register *vpci_get_previous_cap_register
>> +                (struct vpci *vpci, const unsigned int offset)
> 
> The style preference here would be:
> 
> static struct vpci_register *vpci_get_previous_cap_register(
>     struct vpci *vpci, unsigned int offset)
> {
> ...
> 
>> +{
>> +    uint32_t next;
>> +    struct vpci_register *r;
>> +
>> +    if ( offset < 0x40 )
> 
> I would possibly add an ASSERT_UNREACHABLE() here, as attempting to
> pass an offset below 0x40 is a sign of a bug elsewhere?
Probably yes, I will add an ASSERT_UNREACHABLE() here.
> 
>> +        return NULL;
>> +
>> +    r = vpci_get_register(vpci, PCI_CAPABILITY_LIST, 1);
>> +    ASSERT(r);
>> +
>> +    next = (uint32_t)(uintptr_t)r->private;
>> +    while ( next >= 0x40 && next != offset )
>> +    {
>> +        r = vpci_get_register(vpci, next + PCI_CAP_LIST_NEXT, 1);
>> +        ASSERT(r);
>> +        next = (uint32_t)(uintptr_t)r->private;
>> +    }
>> +
>> +    if ( next < 0x40 )
>> +        return NULL;
>> +
>> +    return r;
>> +}
>> +
>> +static void vpci_capability_mask(struct pci_dev *pdev,
> 
> This possibly needs to return an error code, as it can fail, and just
> adding ASSERTs all around seems a bit clumsy, plus we might really
> want to prevent assigning the device to the domain if
> vpci_capability_mask() fails for whatever reason.
Make sense. Will change to return error instead of ASSERT.
> 
>> +                                 const unsigned int cap)
>> +{
>> +    const unsigned int offset = pci_find_cap_offset(pdev->sbdf, cap);
>> +    struct vpci_register *prev_next_r, *next_r;
>> +    struct vpci *vpci = pdev->vpci;
>> +
>> +    spin_lock(&vpci->lock);
>> +    next_r = vpci_get_register(vpci, offset + PCI_CAP_LIST_NEXT, 1);
>> +    if ( !next_r )
>> +    {
>> +        spin_unlock(&vpci->lock);
>> +        return;
>> +    }
>> +
>> +    prev_next_r = vpci_get_previous_cap_register(vpci, offset);
>> +    ASSERT(prev_next_r);
>> +
>> +    prev_next_r->private = next_r->private;
>> +
>> +    if ( !is_hardware_domain(pdev->domain) )
>> +    {
>> +        struct vpci_register *id_r =
>> +            vpci_get_register(vpci, offset + PCI_CAP_LIST_ID, 1);
>> +
>> +        ASSERT(id_r);
>> +        /* PCI_CAP_LIST_ID register of target capability */
>> +        list_del(&id_r->node);
>> +        xfree(id_r);
> 
> You could use vpci_remove_register() here?
Right.
> 
>> +    }
>> +
>> +    /* PCI_CAP_LIST_NEXT register of target capability */
>> +    list_del(&next_r->node);
>> +    spin_unlock(&vpci->lock);
>> +    xfree(next_r);
> 
> Here vpci_remove_register() could also be used, but it will involve
> searching again for the register to remove, which is a bit pointless.
Yes, so just keeping things here instead of calling vpci_remove_register()?
> 
>> +}
>> +
>> +static void vpci_init_capabilities(struct pci_dev *pdev)
>>  {
>>      for ( unsigned int i = 0; i < NUM_VPCI_INIT; i++ )
>>      {
>> @@ -107,10 +207,17 @@ static int vpci_init_capabilities(struct pci_dev *pdev)
>>          rc = capability->init(pdev);
>>  
>>          if ( rc )
>> -            return rc;
>> +        {
>> +            if ( capability->fini )
>> +                capability->fini(pdev);
>> +
>> +            printk(XENLOG_WARNING "%pd %pp: %s cap %u init fail rc=%d, mask it\n",
> 
> Best to split to next line:
> 
>               printk(XENLOG_WARNING
> 	             "%pd %pp: %s cap %u init fail rc=%d, mask it\n",
> 
> Thanks, Roger.
-- 
Best regards,
Jiqian Chen.
                
            On Wed, May 07, 2025 at 06:38:45AM +0000, Chen, Jiqian wrote: > On 2025/5/7 00:00, Roger Pau Monné wrote: > > On Mon, Apr 21, 2025 at 02:18:58PM +0800, Jiqian Chen wrote: > >> + } > >> + > >> + /* PCI_CAP_LIST_NEXT register of target capability */ > >> + list_del(&next_r->node); > >> + spin_unlock(&vpci->lock); > >> + xfree(next_r); > > > > Here vpci_remove_register() could also be used, but it will involve > > searching again for the register to remove, which is a bit pointless. > Yes, so just keeping things here instead of calling vpci_remove_register()? I would add a small comment that we avoid calling vpci_remove_register() to not have to redo the register search. Thanks, Roger.
© 2016 - 2025 Red Hat, Inc.