[PATCH v3 02/23] vfio: enable per-IRQ MSI-X masking

John Levon posted 23 patches 5 months, 1 week ago
Maintainers: John Levon <john.levon@nutanix.com>, Thanos Makatos <thanos.makatos@nutanix.com>, Alex Williamson <alex.williamson@redhat.com>, "Cédric Le Goater" <clg@redhat.com>, Paolo Bonzini <pbonzini@redhat.com>, "Marc-André Lureau" <marcandre.lureau@redhat.com>, "Daniel P. Berrangé" <berrange@redhat.com>, "Philippe Mathieu-Daudé" <philmd@linaro.org>
There is a newer version of this series
[PATCH v3 02/23] vfio: enable per-IRQ MSI-X masking
Posted by John Levon 5 months, 1 week ago
If VFIO_IRQ_INFO_MASKABLE is set for VFIO_PCI_MSIX_IRQ_INDEX, record
this in ->can_mask_msix, and use it to individually mask MSI-X
interrupts as needed.

Originally-by: John Johnson <john.g.johnson@oracle.com>
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
Signed-off-by: John Levon <john.levon@nutanix.com>
---
 hw/vfio/pci.h                 |  1 +
 include/hw/vfio/vfio-device.h |  2 ++
 hw/vfio/device.c              | 26 ++++++++++++++++++++
 hw/vfio/pci.c                 | 46 ++++++++++++++++++++++++++++++-----
 4 files changed, 69 insertions(+), 6 deletions(-)

diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index d4c6b2e7b7..e3a7d7bdca 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -191,6 +191,7 @@ struct VFIOPCIDevice {
     bool defer_kvm_irq_routing;
     bool clear_parent_atomics_on_exit;
     bool skip_vsc_check;
+    bool can_mask_msix;
     VFIODisplay *dpy;
     Notifier irqchip_change_notifier;
 };
diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h
index 8bcb3c19f6..923f9cd116 100644
--- a/include/hw/vfio/vfio-device.h
+++ b/include/hw/vfio/vfio-device.h
@@ -133,7 +133,9 @@ struct VFIODeviceOps {
     (ret < 0 ? strerror(-ret) : "short write")
 
 void vfio_device_irq_disable(VFIODevice *vbasedev, int index);
+void vfio_device_irq_unmask_single(VFIODevice *vbasedev, int index, int irq);
 void vfio_device_irq_unmask(VFIODevice *vbasedev, int index);
+void vfio_device_irq_mask_single(VFIODevice *vbasedev, int index, int irq);
 void vfio_device_irq_mask(VFIODevice *vbasedev, int index);
 bool vfio_device_irq_set_signaling(VFIODevice *vbasedev, int index, int subindex,
                                    int action, int fd, Error **errp);
diff --git a/hw/vfio/device.c b/hw/vfio/device.c
index 9fba2c7272..d0068086ae 100644
--- a/hw/vfio/device.c
+++ b/hw/vfio/device.c
@@ -85,6 +85,19 @@ void vfio_device_irq_disable(VFIODevice *vbasedev, int index)
     vbasedev->io_ops->set_irqs(vbasedev, &irq_set);
 }
 
+void vfio_device_irq_unmask_single(VFIODevice *vbasedev, int index, int irq)
+{
+    struct vfio_irq_set irq_set = {
+        .argsz = sizeof(irq_set),
+        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
+        .index = index,
+        .start = irq,
+        .count = 1,
+    };
+
+    vbasedev->io_ops->set_irqs(vbasedev, &irq_set);
+}
+
 void vfio_device_irq_unmask(VFIODevice *vbasedev, int index)
 {
     struct vfio_irq_set irq_set = {
@@ -98,6 +111,19 @@ void vfio_device_irq_unmask(VFIODevice *vbasedev, int index)
     vbasedev->io_ops->set_irqs(vbasedev, &irq_set);
 }
 
+void vfio_device_irq_mask_single(VFIODevice *vbasedev, int index, int irq)
+{
+    struct vfio_irq_set irq_set = {
+        .argsz = sizeof(irq_set),
+        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
+        .index = index,
+        .start = irq,
+        .count = 1,
+    };
+
+    vbasedev->io_ops->set_irqs(vbasedev, &irq_set);
+}
+
 void vfio_device_irq_mask(VFIODevice *vbasedev, int index)
 {
     struct vfio_irq_set irq_set = {
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index a49405660a..714d37e227 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -535,6 +535,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
 {
     VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
     VFIOMSIVector *vector;
+    bool new_vec = false;
     int ret;
     bool resizing = !!(vdev->nr_vectors < nr + 1);
 
@@ -575,6 +576,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
                 kvm_irqchip_commit_route_changes(&vfio_route_change);
                 vfio_connect_kvm_msi_virq(vector);
             }
+            new_vec = true;
         }
     }
 
@@ -584,6 +586,9 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
      * in use, so we shutdown and incrementally increase them as needed.
      * nr_vectors represents the total number of vectors allocated.
      *
+     * Otherwise, unmask the vector if the vector is already setup (and we can
+     * do so) or send the fd if not.
+     *
      * When dynamic allocation is supported, let the host only allocate
      * and enable a vector when it is in use in guest. nr_vectors represents
      * the upper bound of vectors being enabled (but not all of the ranges
@@ -594,13 +599,20 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
     }
 
     if (!vdev->defer_kvm_irq_routing) {
-        if (vdev->msix->noresize && resizing) {
-            vfio_device_irq_disable(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
-            ret = vfio_enable_vectors(vdev, true);
-            if (ret) {
-                error_report("vfio: failed to enable vectors, %s",
-                             strerror(-ret));
+        if (resizing) {
+            if (vdev->msix->noresize) {
+                vfio_device_irq_disable(&vdev->vbasedev,
+                                        VFIO_PCI_MSIX_IRQ_INDEX);
+                ret = vfio_enable_vectors(vdev, true);
+                if (ret) {
+                    error_report("vfio: failed to enable vectors, %d", ret);
+                }
+            } else {
+                set_irq_signalling(&vdev->vbasedev, vector, nr);
             }
+        } else if (vdev->can_mask_msix && !new_vec) {
+            vfio_device_irq_unmask_single(&vdev->vbasedev,
+                                          VFIO_PCI_MSIX_IRQ_INDEX, nr);
         } else {
             set_irq_signalling(&vdev->vbasedev, vector, nr);
         }
@@ -630,6 +642,13 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
 
     trace_vfio_msix_vector_release(vdev->vbasedev.name, nr);
 
+    /* just mask vector if peer supports it */
+    if (vdev->can_mask_msix) {
+        vfio_device_irq_mask_single(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX,
+                                    nr);
+        return;
+    }
+
     /*
      * There are still old guests that mask and unmask vectors on every
      * interrupt.  If we're using QEMU bypass with a KVM irqfd, leave all of
@@ -702,6 +721,13 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev)
             error_report("vfio: failed to enable vectors, %s",
                          strerror(-ret));
         }
+    } else if (vdev->can_mask_msix) {
+        /*
+         * If we can use single irq masking, send an invalid fd on vector 0
+         * to enable MSI-X without any vectors enabled.
+         */
+        vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX,
+                                      0, VFIO_IRQ_SET_ACTION_TRIGGER, -1, NULL);
     } else {
         /*
          * Some communication channels between VF & PF or PF & fw rely on the
@@ -2842,6 +2868,14 @@ bool vfio_pci_populate_device(VFIOPCIDevice *vdev, Error **errp)
         }
     }
 
+    ret = vfio_device_get_irq_info(vbasedev, VFIO_PCI_MSIX_IRQ_INDEX,
+                                   &irq_info);
+    if (ret == 0 && (irq_info.flags & VFIO_IRQ_INFO_MASKABLE)) {
+        vdev->can_mask_msix = true;
+    } else {
+        vdev->can_mask_msix = false;
+    }
+
     ret = vfio_device_get_irq_info(vbasedev, VFIO_PCI_ERR_IRQ_INDEX, &irq_info);
     if (ret) {
         /* This can fail for an old kernel or legacy PCI dev */
-- 
2.43.0
Re: [PATCH v3 02/23] vfio: enable per-IRQ MSI-X masking
Posted by John Levon 5 months, 1 week ago
On Fri, Jun 06, 2025 at 05:10:34PM -0700, John Levon wrote:

> If VFIO_IRQ_INFO_MASKABLE is set for VFIO_PCI_MSIX_IRQ_INDEX, record
> this in ->can_mask_msix, and use it to individually mask MSI-X
> interrupts as needed.

I'm just going to drop this patch. Neither vfio nor libvfio-user (including
qemu-as-server) report MASKABLE for MSI-X anyway, so it doesn't seem relevant.

I'm not sure if Oracle had some other use case in mind or had previously tested
it somehow.

Furthermore, this:

> @@ -702,6 +721,13 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev)
>              error_report("vfio: failed to enable vectors, %s",
>                           strerror(-ret));
>          }
> +    } else if (vdev->can_mask_msix) {
> +        /*
> +         * If we can use single irq masking, send an invalid fd on vector 0
> +         * to enable MSI-X without any vectors enabled.
> +         */
> +        vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX,
> +                                      0, VFIO_IRQ_SET_ACTION_TRIGGER, -1, NULL);
>      } else {
>          /*
>           * Some communication channels between VF & PF or PF & fw rely on the

Seems odd as it doesn't pass DATA_EVENTFD, unlike the vfio_enable_msix_no_vec()
below it; I have no idea why there is a difference or whether it makes sense,
but it doesn't seem to.

regards
john
Re: [PATCH v3 02/23] vfio: enable per-IRQ MSI-X masking
Posted by Cédric Le Goater 5 months, 1 week ago
On 6/10/25 23:52, John Levon wrote:
> On Fri, Jun 06, 2025 at 05:10:34PM -0700, John Levon wrote:
> 
>> If VFIO_IRQ_INFO_MASKABLE is set for VFIO_PCI_MSIX_IRQ_INDEX, record
>> this in ->can_mask_msix, and use it to individually mask MSI-X
>> interrupts as needed.
> 
> I'm just going to drop this patch. Neither vfio nor libvfio-user (including
> qemu-as-server) report MASKABLE for MSI-X anyway, so it doesn't seem relevant.

ok. Dropped.


Thanks,

C.
Re: [PATCH v3 02/23] vfio: enable per-IRQ MSI-X masking
Posted by Cédric Le Goater 5 months, 1 week ago
On 6/7/25 02:10, John Levon wrote:
> If VFIO_IRQ_INFO_MASKABLE is set for VFIO_PCI_MSIX_IRQ_INDEX, record
> this in ->can_mask_msix, and use it to individually mask MSI-X
> interrupts as needed.
> 
> Originally-by: John Johnson <john.g.johnson@oracle.com>
> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
> Signed-off-by: John Levon <john.levon@nutanix.com>
> ---
>   hw/vfio/pci.h                 |  1 +
>   include/hw/vfio/vfio-device.h |  2 ++
>   hw/vfio/device.c              | 26 ++++++++++++++++++++
>   hw/vfio/pci.c                 | 46 ++++++++++++++++++++++++++++++-----
>   4 files changed, 69 insertions(+), 6 deletions(-)


I find these changes difficult to understand. Can you split them
a bit more?


Thanks,

C.




> 
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index d4c6b2e7b7..e3a7d7bdca 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -191,6 +191,7 @@ struct VFIOPCIDevice {
>       bool defer_kvm_irq_routing;
>       bool clear_parent_atomics_on_exit;
>       bool skip_vsc_check;
> +    bool can_mask_msix;
>       VFIODisplay *dpy;
>       Notifier irqchip_change_notifier;
>   };
> diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h
> index 8bcb3c19f6..923f9cd116 100644
> --- a/include/hw/vfio/vfio-device.h
> +++ b/include/hw/vfio/vfio-device.h
> @@ -133,7 +133,9 @@ struct VFIODeviceOps {
>       (ret < 0 ? strerror(-ret) : "short write")
>   
>   void vfio_device_irq_disable(VFIODevice *vbasedev, int index);
> +void vfio_device_irq_unmask_single(VFIODevice *vbasedev, int index, int irq);
>   void vfio_device_irq_unmask(VFIODevice *vbasedev, int index);
> +void vfio_device_irq_mask_single(VFIODevice *vbasedev, int index, int irq);
>   void vfio_device_irq_mask(VFIODevice *vbasedev, int index);
>   bool vfio_device_irq_set_signaling(VFIODevice *vbasedev, int index, int subindex,
>                                      int action, int fd, Error **errp);
> diff --git a/hw/vfio/device.c b/hw/vfio/device.c
> index 9fba2c7272..d0068086ae 100644
> --- a/hw/vfio/device.c
> +++ b/hw/vfio/device.c
> @@ -85,6 +85,19 @@ void vfio_device_irq_disable(VFIODevice *vbasedev, int index)
>       vbasedev->io_ops->set_irqs(vbasedev, &irq_set);
>   }
>   
> +void vfio_device_irq_unmask_single(VFIODevice *vbasedev, int index, int irq)
> +{
> +    struct vfio_irq_set irq_set = {
> +        .argsz = sizeof(irq_set),
> +        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
> +        .index = index,
> +        .start = irq,
> +        .count = 1,
> +    };
> +
> +    vbasedev->io_ops->set_irqs(vbasedev, &irq_set);
> +}
> +
>   void vfio_device_irq_unmask(VFIODevice *vbasedev, int index)
>   {
>       struct vfio_irq_set irq_set = {
> @@ -98,6 +111,19 @@ void vfio_device_irq_unmask(VFIODevice *vbasedev, int index)
>       vbasedev->io_ops->set_irqs(vbasedev, &irq_set);
>   }
>   
> +void vfio_device_irq_mask_single(VFIODevice *vbasedev, int index, int irq)
> +{
> +    struct vfio_irq_set irq_set = {
> +        .argsz = sizeof(irq_set),
> +        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
> +        .index = index,
> +        .start = irq,
> +        .count = 1,
> +    };
> +
> +    vbasedev->io_ops->set_irqs(vbasedev, &irq_set);
> +}
> +
>   void vfio_device_irq_mask(VFIODevice *vbasedev, int index)
>   {
>       struct vfio_irq_set irq_set = {
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index a49405660a..714d37e227 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -535,6 +535,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>   {
>       VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
>       VFIOMSIVector *vector;
> +    bool new_vec = false;
>       int ret;
>       bool resizing = !!(vdev->nr_vectors < nr + 1);
>   
> @@ -575,6 +576,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>                   kvm_irqchip_commit_route_changes(&vfio_route_change);
>                   vfio_connect_kvm_msi_virq(vector);
>               }
> +            new_vec = true;
>           }
>       }
>   
> @@ -584,6 +586,9 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>        * in use, so we shutdown and incrementally increase them as needed.
>        * nr_vectors represents the total number of vectors allocated.
>        *
> +     * Otherwise, unmask the vector if the vector is already setup (and we can
> +     * do so) or send the fd if not.
> +     *
>        * When dynamic allocation is supported, let the host only allocate
>        * and enable a vector when it is in use in guest. nr_vectors represents
>        * the upper bound of vectors being enabled (but not all of the ranges
> @@ -594,13 +599,20 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>       }
>   
>       if (!vdev->defer_kvm_irq_routing) {
> -        if (vdev->msix->noresize && resizing) {
> -            vfio_device_irq_disable(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
> -            ret = vfio_enable_vectors(vdev, true);
> -            if (ret) {
> -                error_report("vfio: failed to enable vectors, %s",
> -                             strerror(-ret));
> +        if (resizing) {
> +            if (vdev->msix->noresize) {
> +                vfio_device_irq_disable(&vdev->vbasedev,
> +                                        VFIO_PCI_MSIX_IRQ_INDEX);
> +                ret = vfio_enable_vectors(vdev, true);
> +                if (ret) {
> +                    error_report("vfio: failed to enable vectors, %d", ret);
> +                }
> +            } else {
> +                set_irq_signalling(&vdev->vbasedev, vector, nr);
>               }
> +        } else if (vdev->can_mask_msix && !new_vec) {
> +            vfio_device_irq_unmask_single(&vdev->vbasedev,
> +                                          VFIO_PCI_MSIX_IRQ_INDEX, nr);
>           } else {
>               set_irq_signalling(&vdev->vbasedev, vector, nr);
>           }
> @@ -630,6 +642,13 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
>   
>       trace_vfio_msix_vector_release(vdev->vbasedev.name, nr);
>   
> +    /* just mask vector if peer supports it */
> +    if (vdev->can_mask_msix) {
> +        vfio_device_irq_mask_single(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX,
> +                                    nr);
> +        return;
> +    }
> +
>       /*
>        * There are still old guests that mask and unmask vectors on every
>        * interrupt.  If we're using QEMU bypass with a KVM irqfd, leave all of
> @@ -702,6 +721,13 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev)
>               error_report("vfio: failed to enable vectors, %s",
>                            strerror(-ret));
>           }
> +    } else if (vdev->can_mask_msix) {
> +        /*
> +         * If we can use single irq masking, send an invalid fd on vector 0
> +         * to enable MSI-X without any vectors enabled.
> +         */
> +        vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX,
> +                                      0, VFIO_IRQ_SET_ACTION_TRIGGER, -1, NULL);
>       } else {
>           /*
>            * Some communication channels between VF & PF or PF & fw rely on the
> @@ -2842,6 +2868,14 @@ bool vfio_pci_populate_device(VFIOPCIDevice *vdev, Error **errp)
>           }
>       }
>   
> +    ret = vfio_device_get_irq_info(vbasedev, VFIO_PCI_MSIX_IRQ_INDEX,
> +                                   &irq_info);
> +    if (ret == 0 && (irq_info.flags & VFIO_IRQ_INFO_MASKABLE)) {
> +        vdev->can_mask_msix = true;
> +    } else {
> +        vdev->can_mask_msix = false;
> +    }
> +
>       ret = vfio_device_get_irq_info(vbasedev, VFIO_PCI_ERR_IRQ_INDEX, &irq_info);
>       if (ret) {
>           /* This can fail for an old kernel or legacy PCI dev */