If VFIO_IRQ_INFO_MASKABLE is set for VFIO_PCI_MSIX_IRQ_INDEX, record
this in ->can_mask_msix, and use it to individually mask MSI-X
interrupts as needed.
Originally-by: John Johnson <john.g.johnson@oracle.com>
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
Signed-off-by: John Levon <john.levon@nutanix.com>
---
hw/vfio/pci.h | 1 +
include/hw/vfio/vfio-device.h | 2 ++
hw/vfio/device.c | 26 ++++++++++++++++++++
hw/vfio/pci.c | 46 ++++++++++++++++++++++++++++++-----
4 files changed, 69 insertions(+), 6 deletions(-)
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index d4c6b2e7b7..e3a7d7bdca 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -191,6 +191,7 @@ struct VFIOPCIDevice {
bool defer_kvm_irq_routing;
bool clear_parent_atomics_on_exit;
bool skip_vsc_check;
+ bool can_mask_msix;
VFIODisplay *dpy;
Notifier irqchip_change_notifier;
};
diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h
index 8bcb3c19f6..923f9cd116 100644
--- a/include/hw/vfio/vfio-device.h
+++ b/include/hw/vfio/vfio-device.h
@@ -133,7 +133,9 @@ struct VFIODeviceOps {
(ret < 0 ? strerror(-ret) : "short write")
void vfio_device_irq_disable(VFIODevice *vbasedev, int index);
+void vfio_device_irq_unmask_single(VFIODevice *vbasedev, int index, int irq);
void vfio_device_irq_unmask(VFIODevice *vbasedev, int index);
+void vfio_device_irq_mask_single(VFIODevice *vbasedev, int index, int irq);
void vfio_device_irq_mask(VFIODevice *vbasedev, int index);
bool vfio_device_irq_set_signaling(VFIODevice *vbasedev, int index, int subindex,
int action, int fd, Error **errp);
diff --git a/hw/vfio/device.c b/hw/vfio/device.c
index 9fba2c7272..d0068086ae 100644
--- a/hw/vfio/device.c
+++ b/hw/vfio/device.c
@@ -85,6 +85,19 @@ void vfio_device_irq_disable(VFIODevice *vbasedev, int index)
vbasedev->io_ops->set_irqs(vbasedev, &irq_set);
}
+void vfio_device_irq_unmask_single(VFIODevice *vbasedev, int index, int irq)
+{
+ struct vfio_irq_set irq_set = {
+ .argsz = sizeof(irq_set),
+ .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
+ .index = index,
+ .start = irq,
+ .count = 1,
+ };
+
+ vbasedev->io_ops->set_irqs(vbasedev, &irq_set);
+}
+
void vfio_device_irq_unmask(VFIODevice *vbasedev, int index)
{
struct vfio_irq_set irq_set = {
@@ -98,6 +111,19 @@ void vfio_device_irq_unmask(VFIODevice *vbasedev, int index)
vbasedev->io_ops->set_irqs(vbasedev, &irq_set);
}
+void vfio_device_irq_mask_single(VFIODevice *vbasedev, int index, int irq)
+{
+ struct vfio_irq_set irq_set = {
+ .argsz = sizeof(irq_set),
+ .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
+ .index = index,
+ .start = irq,
+ .count = 1,
+ };
+
+ vbasedev->io_ops->set_irqs(vbasedev, &irq_set);
+}
+
void vfio_device_irq_mask(VFIODevice *vbasedev, int index)
{
struct vfio_irq_set irq_set = {
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index a49405660a..714d37e227 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -535,6 +535,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
{
VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
VFIOMSIVector *vector;
+ bool new_vec = false;
int ret;
bool resizing = !!(vdev->nr_vectors < nr + 1);
@@ -575,6 +576,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
kvm_irqchip_commit_route_changes(&vfio_route_change);
vfio_connect_kvm_msi_virq(vector);
}
+ new_vec = true;
}
}
@@ -584,6 +586,9 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
* in use, so we shutdown and incrementally increase them as needed.
* nr_vectors represents the total number of vectors allocated.
*
+ * Otherwise, unmask the vector if the vector is already setup (and we can
+ * do so) or send the fd if not.
+ *
* When dynamic allocation is supported, let the host only allocate
* and enable a vector when it is in use in guest. nr_vectors represents
* the upper bound of vectors being enabled (but not all of the ranges
@@ -594,13 +599,20 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
}
if (!vdev->defer_kvm_irq_routing) {
- if (vdev->msix->noresize && resizing) {
- vfio_device_irq_disable(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
- ret = vfio_enable_vectors(vdev, true);
- if (ret) {
- error_report("vfio: failed to enable vectors, %s",
- strerror(-ret));
+ if (resizing) {
+ if (vdev->msix->noresize) {
+ vfio_device_irq_disable(&vdev->vbasedev,
+ VFIO_PCI_MSIX_IRQ_INDEX);
+ ret = vfio_enable_vectors(vdev, true);
+ if (ret) {
+ error_report("vfio: failed to enable vectors, %d", ret);
+ }
+ } else {
+ set_irq_signalling(&vdev->vbasedev, vector, nr);
}
+ } else if (vdev->can_mask_msix && !new_vec) {
+ vfio_device_irq_unmask_single(&vdev->vbasedev,
+ VFIO_PCI_MSIX_IRQ_INDEX, nr);
} else {
set_irq_signalling(&vdev->vbasedev, vector, nr);
}
@@ -630,6 +642,13 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
trace_vfio_msix_vector_release(vdev->vbasedev.name, nr);
+ /* just mask vector if peer supports it */
+ if (vdev->can_mask_msix) {
+ vfio_device_irq_mask_single(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX,
+ nr);
+ return;
+ }
+
/*
* There are still old guests that mask and unmask vectors on every
* interrupt. If we're using QEMU bypass with a KVM irqfd, leave all of
@@ -702,6 +721,13 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev)
error_report("vfio: failed to enable vectors, %s",
strerror(-ret));
}
+ } else if (vdev->can_mask_msix) {
+ /*
+ * If we can use single irq masking, send an invalid fd on vector 0
+ * to enable MSI-X without any vectors enabled.
+ */
+ vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX,
+ 0, VFIO_IRQ_SET_ACTION_TRIGGER, -1, NULL);
} else {
/*
* Some communication channels between VF & PF or PF & fw rely on the
@@ -2842,6 +2868,14 @@ bool vfio_pci_populate_device(VFIOPCIDevice *vdev, Error **errp)
}
}
+ ret = vfio_device_get_irq_info(vbasedev, VFIO_PCI_MSIX_IRQ_INDEX,
+ &irq_info);
+ if (ret == 0 && (irq_info.flags & VFIO_IRQ_INFO_MASKABLE)) {
+ vdev->can_mask_msix = true;
+ } else {
+ vdev->can_mask_msix = false;
+ }
+
ret = vfio_device_get_irq_info(vbasedev, VFIO_PCI_ERR_IRQ_INDEX, &irq_info);
if (ret) {
/* This can fail for an old kernel or legacy PCI dev */
--
2.43.0
On Fri, Jun 06, 2025 at 05:10:34PM -0700, John Levon wrote:
> If VFIO_IRQ_INFO_MASKABLE is set for VFIO_PCI_MSIX_IRQ_INDEX, record
> this in ->can_mask_msix, and use it to individually mask MSI-X
> interrupts as needed.
I'm just going to drop this patch. Neither vfio nor libvfio-user (including
qemu-as-server) report MASKABLE for MSI-X anyway, so it doesn't seem relevant.
I'm not sure if Oracle had some other use case in mind or had previously tested
it somehow.
Furthermore, this:
> @@ -702,6 +721,13 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev)
> error_report("vfio: failed to enable vectors, %s",
> strerror(-ret));
> }
> + } else if (vdev->can_mask_msix) {
> + /*
> + * If we can use single irq masking, send an invalid fd on vector 0
> + * to enable MSI-X without any vectors enabled.
> + */
> + vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX,
> + 0, VFIO_IRQ_SET_ACTION_TRIGGER, -1, NULL);
> } else {
> /*
> * Some communication channels between VF & PF or PF & fw rely on the
Seems odd as it doesn't pass DATA_EVENTFD, unlike the vfio_enable_msix_no_vec()
below it; I have no idea why the two differ or whether that makes sense, but it
doesn't seem to.
regards
john
On 6/10/25 23:52, John Levon wrote:
> On Fri, Jun 06, 2025 at 05:10:34PM -0700, John Levon wrote:
>
>> If VFIO_IRQ_INFO_MASKABLE is set for VFIO_PCI_MSIX_IRQ_INDEX, record
>> this in ->can_mask_msix, and use it to individually mask MSI-X
>> interrupts as needed.
>
> I'm just going to drop this patch. Neither vfio nor libvfio-user (including
> qemu-as-server) report MASKABLE for MSI-X anyway, so it doesn't seem relevant.
ok. Dropped.
Thanks,
C.
On 6/7/25 02:10, John Levon wrote:
> If VFIO_IRQ_INFO_MASKABLE is set for VFIO_PCI_MSIX_IRQ_INDEX, record
> this in ->can_mask_msix, and use it to individually mask MSI-X
> interrupts as needed.
>
> Originally-by: John Johnson <john.g.johnson@oracle.com>
> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
> Signed-off-by: John Levon <john.levon@nutanix.com>
> ---
> hw/vfio/pci.h | 1 +
> include/hw/vfio/vfio-device.h | 2 ++
> hw/vfio/device.c | 26 ++++++++++++++++++++
> hw/vfio/pci.c | 46 ++++++++++++++++++++++++++++++-----
> 4 files changed, 69 insertions(+), 6 deletions(-)
I find these changes difficult to understand. Can you split them
a bit more?
Thanks,
C.
>
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index d4c6b2e7b7..e3a7d7bdca 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -191,6 +191,7 @@ struct VFIOPCIDevice {
> bool defer_kvm_irq_routing;
> bool clear_parent_atomics_on_exit;
> bool skip_vsc_check;
> + bool can_mask_msix;
> VFIODisplay *dpy;
> Notifier irqchip_change_notifier;
> };
> diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h
> index 8bcb3c19f6..923f9cd116 100644
> --- a/include/hw/vfio/vfio-device.h
> +++ b/include/hw/vfio/vfio-device.h
> @@ -133,7 +133,9 @@ struct VFIODeviceOps {
> (ret < 0 ? strerror(-ret) : "short write")
>
> void vfio_device_irq_disable(VFIODevice *vbasedev, int index);
> +void vfio_device_irq_unmask_single(VFIODevice *vbasedev, int index, int irq);
> void vfio_device_irq_unmask(VFIODevice *vbasedev, int index);
> +void vfio_device_irq_mask_single(VFIODevice *vbasedev, int index, int irq);
> void vfio_device_irq_mask(VFIODevice *vbasedev, int index);
> bool vfio_device_irq_set_signaling(VFIODevice *vbasedev, int index, int subindex,
> int action, int fd, Error **errp);
> diff --git a/hw/vfio/device.c b/hw/vfio/device.c
> index 9fba2c7272..d0068086ae 100644
> --- a/hw/vfio/device.c
> +++ b/hw/vfio/device.c
> @@ -85,6 +85,19 @@ void vfio_device_irq_disable(VFIODevice *vbasedev, int index)
> vbasedev->io_ops->set_irqs(vbasedev, &irq_set);
> }
>
> +void vfio_device_irq_unmask_single(VFIODevice *vbasedev, int index, int irq)
> +{
> + struct vfio_irq_set irq_set = {
> + .argsz = sizeof(irq_set),
> + .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
> + .index = index,
> + .start = irq,
> + .count = 1,
> + };
> +
> + vbasedev->io_ops->set_irqs(vbasedev, &irq_set);
> +}
> +
> void vfio_device_irq_unmask(VFIODevice *vbasedev, int index)
> {
> struct vfio_irq_set irq_set = {
> @@ -98,6 +111,19 @@ void vfio_device_irq_unmask(VFIODevice *vbasedev, int index)
> vbasedev->io_ops->set_irqs(vbasedev, &irq_set);
> }
>
> +void vfio_device_irq_mask_single(VFIODevice *vbasedev, int index, int irq)
> +{
> + struct vfio_irq_set irq_set = {
> + .argsz = sizeof(irq_set),
> + .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
> + .index = index,
> + .start = irq,
> + .count = 1,
> + };
> +
> + vbasedev->io_ops->set_irqs(vbasedev, &irq_set);
> +}
> +
> void vfio_device_irq_mask(VFIODevice *vbasedev, int index)
> {
> struct vfio_irq_set irq_set = {
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index a49405660a..714d37e227 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -535,6 +535,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
> {
> VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
> VFIOMSIVector *vector;
> + bool new_vec = false;
> int ret;
> bool resizing = !!(vdev->nr_vectors < nr + 1);
>
> @@ -575,6 +576,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
> kvm_irqchip_commit_route_changes(&vfio_route_change);
> vfio_connect_kvm_msi_virq(vector);
> }
> + new_vec = true;
> }
> }
>
> @@ -584,6 +586,9 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
> * in use, so we shutdown and incrementally increase them as needed.
> * nr_vectors represents the total number of vectors allocated.
> *
> + * Otherwise, unmask the vector if the vector is already setup (and we can
> + * do so) or send the fd if not.
> + *
> * When dynamic allocation is supported, let the host only allocate
> * and enable a vector when it is in use in guest. nr_vectors represents
> * the upper bound of vectors being enabled (but not all of the ranges
> @@ -594,13 +599,20 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
> }
>
> if (!vdev->defer_kvm_irq_routing) {
> - if (vdev->msix->noresize && resizing) {
> - vfio_device_irq_disable(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
> - ret = vfio_enable_vectors(vdev, true);
> - if (ret) {
> - error_report("vfio: failed to enable vectors, %s",
> - strerror(-ret));
> + if (resizing) {
> + if (vdev->msix->noresize) {
> + vfio_device_irq_disable(&vdev->vbasedev,
> + VFIO_PCI_MSIX_IRQ_INDEX);
> + ret = vfio_enable_vectors(vdev, true);
> + if (ret) {
> + error_report("vfio: failed to enable vectors, %d", ret);
> + }
> + } else {
> + set_irq_signalling(&vdev->vbasedev, vector, nr);
> }
> + } else if (vdev->can_mask_msix && !new_vec) {
> + vfio_device_irq_unmask_single(&vdev->vbasedev,
> + VFIO_PCI_MSIX_IRQ_INDEX, nr);
> } else {
> set_irq_signalling(&vdev->vbasedev, vector, nr);
> }
> @@ -630,6 +642,13 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
>
> trace_vfio_msix_vector_release(vdev->vbasedev.name, nr);
>
> + /* just mask vector if peer supports it */
> + if (vdev->can_mask_msix) {
> + vfio_device_irq_mask_single(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX,
> + nr);
> + return;
> + }
> +
> /*
> * There are still old guests that mask and unmask vectors on every
> * interrupt. If we're using QEMU bypass with a KVM irqfd, leave all of
> @@ -702,6 +721,13 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev)
> error_report("vfio: failed to enable vectors, %s",
> strerror(-ret));
> }
> + } else if (vdev->can_mask_msix) {
> + /*
> + * If we can use single irq masking, send an invalid fd on vector 0
> + * to enable MSI-X without any vectors enabled.
> + */
> + vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX,
> + 0, VFIO_IRQ_SET_ACTION_TRIGGER, -1, NULL);
> } else {
> /*
> * Some communication channels between VF & PF or PF & fw rely on the
> @@ -2842,6 +2868,14 @@ bool vfio_pci_populate_device(VFIOPCIDevice *vdev, Error **errp)
> }
> }
>
> + ret = vfio_device_get_irq_info(vbasedev, VFIO_PCI_MSIX_IRQ_INDEX,
> + &irq_info);
> + if (ret == 0 && (irq_info.flags & VFIO_IRQ_INFO_MASKABLE)) {
> + vdev->can_mask_msix = true;
> + } else {
> + vdev->can_mask_msix = false;
> + }
> +
> ret = vfio_device_get_irq_info(vbasedev, VFIO_PCI_ERR_IRQ_INDEX, &irq_info);
> if (ret) {
> /* This can fail for an old kernel or legacy PCI dev */
© 2016 - 2025 Red Hat, Inc.