[RFC] vfio/migration: reduce the msix virq setup cost in resume phase

Longpeng(Mike) posted 1 patch 2 years, 8 months ago
Test checkpatch failed
Patches applied successfully (tree, apply log)
git fetch https://github.com/patchew-project/qemu tags/patchew/20210813040614.1764-1-longpeng2@huawei.com
Maintainers: Alex Williamson <alex.williamson@redhat.com>
hw/vfio/pci.c | 39 ++++++++++++++++++++++++++++++++++++++-
hw/vfio/pci.h |  2 ++
2 files changed, 40 insertions(+), 1 deletion(-)
[RFC] vfio/migration: reduce the msix virq setup cost in resume phase
Posted by Longpeng(Mike) 2 years, 8 months ago
In the migration resume phase, all unmasked MSI-X vectors need to be
set up when loading the VF state. However, the setup operation takes
longer if the VF has more unmasked vectors.

In our case, the VF has 65 vectors and each one spends 0.8ms on the
setup operation (vfio_add_kvm_msi_virq -> kvm_irqchip_commit_routes),
so the total cost for the VF is more than 40ms. Even worse, the VM has
8 VFs, so the downtime increases by more than 320ms.

vfio_pci_load_config
  vfio_msix_enable
    msix_set_vector_notifiers
      for (vector = 0; vector < dev->msix_entries_nr; vector++) {
        vfio_msix_vector_do_use
          vfio_add_kvm_msi_virq
            kvm_irqchip_commit_routes <-- 0.8ms
      }

Originally, we tried to batch all routes and commit just once outside
the loop, but it's not easy to fall back to QEMU interrupts if one of
them fails.

So this patch tries to defer the KVM interrupt setup: an unmasked
vector uses a QEMU interrupt by default and switches to a KVM
interrupt once it first fires.
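
In outline, the flow implemented by the diff below:

vfio_pci_load_config
  vfio_msix_defer_set_virq(vdev, true)
  vfio_msix_enable
    vfio_msix_vector_do_use
      vector->need_switch = true     <-- keep using the QEMU eventfd path
...
vfio_msi_interrupt                   <-- first interrupt arrives via QEMU
  vfio_add_kvm_msix_virq             <-- set up the KVM route and irqfd now
    vfio_add_kvm_msi_virq
    vfio_set_irq_signaling           <-- retarget the trigger to the irqfd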

Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com>
---
 hw/vfio/pci.c | 39 ++++++++++++++++++++++++++++++++++++++-
 hw/vfio/pci.h |  2 ++
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index e1ea1d8..dd35170 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -47,6 +47,8 @@
 
 static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
+static void vfio_add_kvm_msix_virq(VFIOPCIDevice *vdev,
+                                   VFIOMSIVector *vector, int nr);
 
 /*
  * Disabling BAR mmaping can be slow, but toggling it around INTx can
@@ -347,6 +349,11 @@ static void vfio_msi_interrupt(void *opaque)
         get_msg = msix_get_message;
         notify = msix_notify;
 
+        if (unlikely(vector->need_switch)) {
+            vfio_add_kvm_msix_virq(vdev, vector, nr);
+            vector->need_switch = false;
+        }
+
         /* A masked vector firing needs to use the PBA, enable it */
         if (msix_is_masked(&vdev->pdev, nr)) {
             set_bit(nr, vdev->msix->pending);
@@ -438,6 +445,25 @@ static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
     vector->virq = virq;
 }
 
+static void
+vfio_add_kvm_msix_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, int nr)
+{
+    Error *err = NULL;
+    int fd;
+
+    vfio_add_kvm_msi_virq(vdev, vector, nr, true);
+    if (vector->virq < 0) {
+        return;
+    }
+
+    fd = event_notifier_get_fd(&vector->kvm_interrupt);
+    if (vfio_set_irq_signaling(&vdev->vbasedev,
+                               VFIO_PCI_MSIX_IRQ_INDEX, nr,
+                               VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
+        error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
+    }
+}
+
 static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector)
 {
     kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
@@ -490,7 +516,11 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
         }
     } else {
         if (msg) {
-            vfio_add_kvm_msi_virq(vdev, vector, nr, true);
+            if (unlikely(vdev->defer_set_virq)) {
+                vector->need_switch = true;
+            } else {
+                vfio_add_kvm_msi_virq(vdev, vector, nr, true);
+            }
         }
     }
 
@@ -566,6 +596,11 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
     }
 }
 
+static inline void vfio_msix_defer_set_virq(VFIOPCIDevice *vdev, bool defer)
+{
+    vdev->defer_set_virq = defer;
+}
+
 static void vfio_msix_enable(VFIOPCIDevice *vdev)
 {
     PCIDevice *pdev = &vdev->pdev;
@@ -2466,7 +2501,9 @@ static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
     if (msi_enabled(pdev)) {
         vfio_msi_enable(vdev);
     } else if (msix_enabled(pdev)) {
+        vfio_msix_defer_set_virq(vdev, true);
         vfio_msix_enable(vdev);
+        vfio_msix_defer_set_virq(vdev, false);
     }
 
     return ret;
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index 6477751..846ae85 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -95,6 +95,7 @@ typedef struct VFIOMSIVector {
     struct VFIOPCIDevice *vdev; /* back pointer to device */
     int virq;
     bool use;
+    bool need_switch; /* switch to KVM interrupt? */
 } VFIOMSIVector;
 
 enum {
@@ -171,6 +172,7 @@ struct VFIOPCIDevice {
     bool no_kvm_ioeventfd;
     bool no_vfio_ioeventfd;
     bool enable_ramfb;
+    bool defer_set_virq;
     VFIODisplay *dpy;
     Notifier irqchip_change_notifier;
 };
-- 
1.8.3.1


Re: [RFC] vfio/migration: reduce the msix virq setup cost in resume phase
Posted by Alex Williamson 2 years, 8 months ago
On Fri, 13 Aug 2021 12:06:14 +0800
"Longpeng(Mike)" <longpeng2@huawei.com> wrote:

> In the migration resume phase, all unmasked MSI-X vectors need to be
> set up when loading the VF state. However, the setup operation takes
> longer if the VF has more unmasked vectors.
> 
> In our case, the VF has 65 vectors and each one spends 0.8ms on the
> setup operation (vfio_add_kvm_msi_virq -> kvm_irqchip_commit_routes),
> so the total cost for the VF is more than 40ms. Even worse, the VM has
> 8 VFs, so the downtime increases by more than 320ms.
> 
> vfio_pci_load_config
>   vfio_msix_enable
>     msix_set_vector_notifiers
>       for (vector = 0; vector < dev->msix_entries_nr; vector++) {
>         vfio_msix_vector_do_use
>           vfio_add_kvm_msi_virq
>             kvm_irqchip_commit_routes <-- 0.8ms
>       }
> 
> Originally, we tried to batch all routes and commit just once outside
> the loop, but it's not easy to fall back to QEMU interrupts if one of
> them fails.

I'm not sure I follow here; once we set up the virq, what's the failure
path?  kvm_irqchip_commit_routes() returns void.  Were you looking at
adding a "defer" arg to kvm_irqchip_add_msi_route() that skips the
commit, which vfio_add_kvm_msi_virq() might set based on the migration
state and vfio_pci_load_config() could then trigger the commit?
There's more overhead that can be removed if VFIO_DEVICE_SET_IRQS could
be called once rather than per vector.
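
Roughly, the suggested shape (a sketch only; the "defer" argument is
hypothetical, today's kvm_irqchip_add_msi_route() takes no such
parameter):

    /* per vector: create the route but skip the KVM_SET_GSI_ROUTING
     * commit */
    virq = kvm_irqchip_add_msi_route(kvm_state, nr, &vdev->pdev /*, defer */);

    /* once, from vfio_pci_load_config(), after all vectors are added:
     * push the whole routing table to KVM in a single ioctl */
    kvm_irqchip_commit_routes(kvm_state);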

> So this patch tries to defer the KVM interrupt setup: an unmasked
> vector uses a QEMU interrupt by default and switches to a KVM
> interrupt once it first fires.
> 
> Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com>
> ---
>  hw/vfio/pci.c | 39 ++++++++++++++++++++++++++++++++++++++-
>  hw/vfio/pci.h |  2 ++
>  2 files changed, 40 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index e1ea1d8..dd35170 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -47,6 +47,8 @@
>  
>  static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
>  static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
> +static void vfio_add_kvm_msix_virq(VFIOPCIDevice *vdev,
> +                                   VFIOMSIVector *vector, int nr);
>  
>  /*
>   * Disabling BAR mmaping can be slow, but toggling it around INTx can
> @@ -347,6 +349,11 @@ static void vfio_msi_interrupt(void *opaque)
>          get_msg = msix_get_message;
>          notify = msix_notify;
>  
> +        if (unlikely(vector->need_switch)) {
> +            vfio_add_kvm_msix_virq(vdev, vector, nr);
> +            vector->need_switch = false;
> +        }
> +

A better name might be "vector->kvm_routing_deferred".  Essentially this
is just a lazy setup of KVM routes; we could always do this, or we could
do it based on a device option.  I wonder if there are any affinity
benefits in the VM to deferring the KVM route.

>          /* A masked vector firing needs to use the PBA, enable it */
>          if (msix_is_masked(&vdev->pdev, nr)) {
>              set_bit(nr, vdev->msix->pending);
> @@ -438,6 +445,25 @@ static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
>      vector->virq = virq;
>  }
>  
> +static void
> +vfio_add_kvm_msix_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, int nr)
> +{
> +    Error *err = NULL;
> +    int fd;
> +
> +    vfio_add_kvm_msi_virq(vdev, vector, nr, true);
> +    if (vector->virq < 0) {
> +        return;
> +    }
> +
> +    fd = event_notifier_get_fd(&vector->kvm_interrupt);
> +    if (vfio_set_irq_signaling(&vdev->vbasedev,
> +                               VFIO_PCI_MSIX_IRQ_INDEX, nr,
> +                               VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
> +        error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
> +    }
> +}
> +
>  static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector)
>  {
>      kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
> @@ -490,7 +516,11 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>          }
>      } else {
>          if (msg) {
> -            vfio_add_kvm_msi_virq(vdev, vector, nr, true);
> +            if (unlikely(vdev->defer_set_virq)) {

Likewise this could be "vdev->defer_kvm_irq_routing" and we could apply
it to all IRQ types.

> +                vector->need_switch = true;
> +            } else {
> +                vfio_add_kvm_msi_virq(vdev, vector, nr, true);
> +            }
>          }
>      }
>  
> @@ -566,6 +596,11 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
>      }
>  }
>  
> +static inline void vfio_msix_defer_set_virq(VFIOPCIDevice *vdev, bool defer)
> +{
> +    vdev->defer_set_virq = defer;
> +}

A helper function seems excessive.

> +
>  static void vfio_msix_enable(VFIOPCIDevice *vdev)
>  {
>      PCIDevice *pdev = &vdev->pdev;
> @@ -2466,7 +2501,9 @@ static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
>      if (msi_enabled(pdev)) {
>          vfio_msi_enable(vdev);
>      } else if (msix_enabled(pdev)) {
> +        vfio_msix_defer_set_virq(vdev, true);
>          vfio_msix_enable(vdev);
> +        vfio_msix_defer_set_virq(vdev, false);

This is the algorithm you want for 65+ vectors, but is this the
algorithm we want for 2 vectors?  Who should define that policy?
We should at least make lazy KVM routing setup a device option to be
able to test it outside of a migration, but should it be enabled by
default?  Would anyone mind too much if there was some additional
latency of each initial vector triggering?  Thanks,

Alex

>      }
>  
>      return ret;
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index 6477751..846ae85 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -95,6 +95,7 @@ typedef struct VFIOMSIVector {
>      struct VFIOPCIDevice *vdev; /* back pointer to device */
>      int virq;
>      bool use;
> +    bool need_switch; /* switch to KVM interrupt? */
>  } VFIOMSIVector;
>  
>  enum {
> @@ -171,6 +172,7 @@ struct VFIOPCIDevice {
>      bool no_kvm_ioeventfd;
>      bool no_vfio_ioeventfd;
>      bool enable_ramfb;
> +    bool defer_set_virq;
>      VFIODisplay *dpy;
>      Notifier irqchip_change_notifier;
>  };


Re: [RFC] vfio/migration: reduce the msix virq setup cost in resume phase
Posted by Longpeng (Mike, Cloud Infrastructure Service Product Dept.) 2 years, 8 months ago

On 2021/8/18 4:26, Alex Williamson wrote:
> On Fri, 13 Aug 2021 12:06:14 +0800
> "Longpeng(Mike)" <longpeng2@huawei.com> wrote:
> 
>> In the migration resume phase, all unmasked MSI-X vectors need to be
>> set up when loading the VF state. However, the setup operation takes
>> longer if the VF has more unmasked vectors.
>>
>> In our case, the VF has 65 vectors and each one spends 0.8ms on the
>> setup operation (vfio_add_kvm_msi_virq -> kvm_irqchip_commit_routes),
>> so the total cost for the VF is more than 40ms. Even worse, the VM has
>> 8 VFs, so the downtime increases by more than 320ms.
>>
>> vfio_pci_load_config
>>   vfio_msix_enable
>>     msix_set_vector_notifiers
>>       for (vector = 0; vector < dev->msix_entries_nr; vector++) {
>>         vfio_msix_vector_do_use
>>           vfio_add_kvm_msi_virq
>>             kvm_irqchip_commit_routes <-- 0.8ms
>>       }
>>
>> Originally, we tried to batch all routes and commit just once outside
>> the loop, but it's not easy to fall back to QEMU interrupts if one of
>> them fails.
> 
> I'm not sure I follow here; once we set up the virq, what's the failure
> path?  kvm_irqchip_commit_routes() returns void.  Were you looking at
> adding a "defer" arg to kvm_irqchip_add_msi_route() that skips the
> commit, which vfio_add_kvm_msi_virq() might set based on the migration
> state and vfio_pci_load_config() could then trigger the commit?

Yes, my implementation is almost exactly the same as what you describe here
and it works, but there's a semantic problem that makes me hesitant.

The call trace in vfio_add_kvm_msi_virq is:
vfio_add_kvm_msi_virq
  kvm_irqchip_add_msi_route
    kvm_irqchip_commit_routes
      kvm_vm_ioctl(KVM_SET_GSI_ROUTING)
  kvm_irqchip_add_irqfd_notifier_gsi
    kvm_irqchip_assign_irqfd
      kvm_vm_ioctl(KVM_IRQFD)

I referred to some other places that need to assign irqfds; the assignment
always happens after the virq is committed.
The KVM API doc does not declare any dependency between them, but the
existing code seems to imply the order. The "defer" approach makes them out
of order; it works at the moment, but I'm not sure whether KVM might change
in the future.

I prefer "defer" too if we can make sure it's OK for the assignment and
commit to be out of order. I hope we can reach agreement on this point
first, and then I'll continue to reply to the comments below if still
necessary.

So do you think we should keep the order of assignment and commit?
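
To make the ordering concern concrete (a sketch at the ioctl level,
based on the call trace above; argument details elided):

    /* today: the GSI route is committed before the irqfd that uses it */
    kvm_vm_ioctl(KVM_SET_GSI_ROUTING)   <-- commit the virq
    kvm_vm_ioctl(KVM_IRQFD)             <-- assign the irqfd

    /* with "defer": each vector's KVM_IRQFD is assigned first, and a
     * single KVM_SET_GSI_ROUTING commit happens at the end */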

> There's more overhead that can be removed if VFIO_DEVICE_SET_IRQS could
> be called once rather than per vector.

Yes, I've already optimized this overhead in our production code. I saw
that upstream also did so this year (commit ecebe53fe); I'll backport the
upstream solution in order to keep pace with the community.

[ stop here ]

> 
>> So this patch tries to defer the KVM interrupt setup: an unmasked
>> vector uses a QEMU interrupt by default and switches to a KVM
>> interrupt once it first fires.
>>
>> Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com>
>> ---
>>  hw/vfio/pci.c | 39 ++++++++++++++++++++++++++++++++++++++-
>>  hw/vfio/pci.h |  2 ++
>>  2 files changed, 40 insertions(+), 1 deletion(-)
>>
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index e1ea1d8..dd35170 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -47,6 +47,8 @@
>>  
>>  static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
>>  static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
>> +static void vfio_add_kvm_msix_virq(VFIOPCIDevice *vdev,
>> +                                   VFIOMSIVector *vector, int nr);
>>  
>>  /*
>>   * Disabling BAR mmaping can be slow, but toggling it around INTx can
>> @@ -347,6 +349,11 @@ static void vfio_msi_interrupt(void *opaque)
>>          get_msg = msix_get_message;
>>          notify = msix_notify;
>>  
>> +        if (unlikely(vector->need_switch)) {
>> +            vfio_add_kvm_msix_virq(vdev, vector, nr);
>> +            vector->need_switch = false;
>> +        }
>> +
> 
> A better name might be "vector->kvm_routing_deferred".  Essentially this
> is just a lazy setup of KVM routes; we could always do this, or we could
> do it based on a device option.  I wonder if there are any affinity
> benefits in the VM to deferring the KVM route.
> 
>>          /* A masked vector firing needs to use the PBA, enable it */
>>          if (msix_is_masked(&vdev->pdev, nr)) {
>>              set_bit(nr, vdev->msix->pending);
>> @@ -438,6 +445,25 @@ static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
>>      vector->virq = virq;
>>  }
>>  
>> +static void
>> +vfio_add_kvm_msix_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, int nr)
>> +{
>> +    Error *err = NULL;
>> +    int fd;
>> +
>> +    vfio_add_kvm_msi_virq(vdev, vector, nr, true);
>> +    if (vector->virq < 0) {
>> +        return;
>> +    }
>> +
>> +    fd = event_notifier_get_fd(&vector->kvm_interrupt);
>> +    if (vfio_set_irq_signaling(&vdev->vbasedev,
>> +                               VFIO_PCI_MSIX_IRQ_INDEX, nr,
>> +                               VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
>> +        error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
>> +    }
>> +}
>> +
>>  static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector)
>>  {
>>      kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
>> @@ -490,7 +516,11 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>>          }
>>      } else {
>>          if (msg) {
>> -            vfio_add_kvm_msi_virq(vdev, vector, nr, true);
>> +            if (unlikely(vdev->defer_set_virq)) {
> 
> Likewise this could be "vdev->defer_kvm_irq_routing" and we could apply
> it to all IRQ types.
> 
>> +                vector->need_switch = true;
>> +            } else {
>> +                vfio_add_kvm_msi_virq(vdev, vector, nr, true);
>> +            }
>>          }
>>      }
>>  
>> @@ -566,6 +596,11 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
>>      }
>>  }
>>  
>> +static inline void vfio_msix_defer_set_virq(VFIOPCIDevice *vdev, bool defer)
>> +{
>> +    vdev->defer_set_virq = defer;
>> +}
> 
> A helper function seems excessive.
> 
>> +
>>  static void vfio_msix_enable(VFIOPCIDevice *vdev)
>>  {
>>      PCIDevice *pdev = &vdev->pdev;
>> @@ -2466,7 +2501,9 @@ static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
>>      if (msi_enabled(pdev)) {
>>          vfio_msi_enable(vdev);
>>      } else if (msix_enabled(pdev)) {
>> +        vfio_msix_defer_set_virq(vdev, true);
>>          vfio_msix_enable(vdev);
>> +        vfio_msix_defer_set_virq(vdev, false);
> 
> This is the algorithm you want for 65+ vectors, but is this the
> algorithm we want for 2 vectors?  Who should define that policy?
> We should at least make lazy KVM routing setup a device option to be
> able to test it outside of a migration, but should it be enabled by
> default?  Would anyone mind too much if there was some additional
> latency of each initial vector triggering?  Thanks,
> 
> Alex
> 
>>      }
>>  
>>      return ret;
>> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
>> index 6477751..846ae85 100644
>> --- a/hw/vfio/pci.h
>> +++ b/hw/vfio/pci.h
>> @@ -95,6 +95,7 @@ typedef struct VFIOMSIVector {
>>      struct VFIOPCIDevice *vdev; /* back pointer to device */
>>      int virq;
>>      bool use;
>> +    bool need_switch; /* switch to KVM interrupt? */
>>  } VFIOMSIVector;
>>  
>>  enum {
>> @@ -171,6 +172,7 @@ struct VFIOPCIDevice {
>>      bool no_kvm_ioeventfd;
>>      bool no_vfio_ioeventfd;
>>      bool enable_ramfb;
>> +    bool defer_set_virq;
>>      VFIODisplay *dpy;
>>      Notifier irqchip_change_notifier;
>>  };
> 
> .
> 

Re: [RFC] vfio/migration: reduce the msix virq setup cost in resume phase
Posted by Longpeng (Mike, Cloud Infrastructure Service Product Dept.) 2 years, 8 months ago

On 2021/8/18 11:50, Longpeng (Mike, Cloud Infrastructure Service Product Dept.) wrote:
> 
> 
> On 2021/8/18 4:26, Alex Williamson wrote:
>> On Fri, 13 Aug 2021 12:06:14 +0800
>> "Longpeng(Mike)" <longpeng2@huawei.com> wrote:
>>
>>> In the migration resume phase, all unmasked MSI-X vectors need to be
>>> set up when loading the VF state. However, the setup operation takes
>>> longer if the VF has more unmasked vectors.
>>>
>>> In our case, the VF has 65 vectors and each one spends 0.8ms on the
>>> setup operation (vfio_add_kvm_msi_virq -> kvm_irqchip_commit_routes),
>>> so the total cost for the VF is more than 40ms. Even worse, the VM has
>>> 8 VFs, so the downtime increases by more than 320ms.
>>>
>>> vfio_pci_load_config
>>>   vfio_msix_enable
>>>     msix_set_vector_notifiers
>>>       for (vector = 0; vector < dev->msix_entries_nr; vector++) {
>>>         vfio_msix_vector_do_use
>>>           vfio_add_kvm_msi_virq
>>>             kvm_irqchip_commit_routes <-- 0.8ms
>>>       }
>>>
>>> Originally, we tried to batch all routes and commit just once outside
>>> the loop, but it's not easy to fall back to QEMU interrupts if one of
>>> them fails.
>>
>> I'm not sure I follow here; once we set up the virq, what's the failure
>> path?  kvm_irqchip_commit_routes() returns void.  Were you looking at
>> adding a "defer" arg to kvm_irqchip_add_msi_route() that skips the
>> commit, which vfio_add_kvm_msi_virq() might set based on the migration
>> state and vfio_pci_load_config() could then trigger the commit?
> 
> Yes, my implementation is almost exactly the same as what you describe here
> and it works, but there's a semantic problem that makes me hesitant.
> 
> The call trace in vfio_add_kvm_msi_virq is:
> vfio_add_kvm_msi_virq
>   kvm_irqchip_add_msi_route
>     kvm_irqchip_commit_routes
>       kvm_vm_ioctl(KVM_SET_GSI_ROUTING)
>   kvm_irqchip_add_irqfd_notifier_gsi
>     kvm_irqchip_assign_irqfd
>       kvm_vm_ioctl(KVM_IRQFD)
> 
> I referred to some other places that need to assign irqfds; the assignment
> always happens after the virq is committed.
> The KVM API doc does not declare any dependency between them, but the
> existing code seems to imply the order. The "defer" approach makes them out
> of order; it works at the moment, but I'm not sure whether KVM might change
> in the future.
> 
> I prefer "defer" too if we can make sure it's OK for the assignment and
> commit to be out of order. I hope we can reach agreement on this point
> first, and then I'll continue to reply to the comments below if still
> necessary.
> 
> So do you think we should keep the order of assignment and commit?
> 
>> There's more overhead that can be removed if VFIO_DEVICE_SET_IRQS could
>> be called once rather than per vector.
> 
> Yes, I've already optimized this overhead in our production code. I saw
> that upstream also did so this year (commit ecebe53fe); I'll backport the
> upstream solution in order to keep pace with the community.
> 

Oh, commit ecebe53fe cannot save the VFIO_DEVICE_SET_IRQS operations: the
ioctl still needs to be issued (in vfio_set_irq_signaling) for each unmasked
vector during the resume phase.

In my implementation, the per-vector VFIO_DEVICE_SET_IRQS is skipped
entirely and vfio_enable_vectors is called only once, outside the loop. I'll
send that optimization together in the next version.
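
A rough sketch of that shape, reusing the existing vfio_enable_vectors()
helper in hw/vfio/pci.c (error handling elided):

    /* after msix_set_vector_notifiers() has populated every vector's
     * eventfd, program them all with a single VFIO_DEVICE_SET_IRQS ioctl */
    if (vfio_enable_vectors(vdev, true)) {
        error_report("vfio: failed to enable MSI-X vectors");
    }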

> [ stop here ]
> 
>>
>>> So this patch tries to defer the KVM interrupt setup: an unmasked
>>> vector uses a QEMU interrupt by default and switches to a KVM
>>> interrupt once it first fires.
>>>
>>> Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com>
>>> ---
>>>  hw/vfio/pci.c | 39 ++++++++++++++++++++++++++++++++++++++-
>>>  hw/vfio/pci.h |  2 ++
>>>  2 files changed, 40 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>> index e1ea1d8..dd35170 100644
>>> --- a/hw/vfio/pci.c
>>> +++ b/hw/vfio/pci.c
>>> @@ -47,6 +47,8 @@
>>>  
>>>  static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
>>>  static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
>>> +static void vfio_add_kvm_msix_virq(VFIOPCIDevice *vdev,
>>> +                                   VFIOMSIVector *vector, int nr);
>>>  
>>>  /*
>>>   * Disabling BAR mmaping can be slow, but toggling it around INTx can
>>> @@ -347,6 +349,11 @@ static void vfio_msi_interrupt(void *opaque)
>>>          get_msg = msix_get_message;
>>>          notify = msix_notify;
>>>  
>>> +        if (unlikely(vector->need_switch)) {
>>> +            vfio_add_kvm_msix_virq(vdev, vector, nr);
>>> +            vector->need_switch = false;
>>> +        }
>>> +
>>
>> A better name might be "vector->kvm_routing_deferred".  Essentially this
>> is just a lazy setup of KVM routes; we could always do this, or we could
>> do it based on a device option.  I wonder if there are any affinity
>> benefits in the VM to deferring the KVM route.
>>
>>>          /* A masked vector firing needs to use the PBA, enable it */
>>>          if (msix_is_masked(&vdev->pdev, nr)) {
>>>              set_bit(nr, vdev->msix->pending);
>>> @@ -438,6 +445,25 @@ static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
>>>      vector->virq = virq;
>>>  }
>>>  
>>> +static void
>>> +vfio_add_kvm_msix_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, int nr)
>>> +{
>>> +    Error *err = NULL;
>>> +    int fd;
>>> +
>>> +    vfio_add_kvm_msi_virq(vdev, vector, nr, true);
>>> +    if (vector->virq < 0) {
>>> +        return;
>>> +    }
>>> +
>>> +    fd = event_notifier_get_fd(&vector->kvm_interrupt);
>>> +    if (vfio_set_irq_signaling(&vdev->vbasedev,
>>> +                               VFIO_PCI_MSIX_IRQ_INDEX, nr,
>>> +                               VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
>>> +        error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
>>> +    }
>>> +}
>>> +
>>>  static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector)
>>>  {
>>>      kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
>>> @@ -490,7 +516,11 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>>>          }
>>>      } else {
>>>          if (msg) {
>>> -            vfio_add_kvm_msi_virq(vdev, vector, nr, true);
>>> +            if (unlikely(vdev->defer_set_virq)) {
>>
>> Likewise this could be "vdev->defer_kvm_irq_routing" and we could apply
>> it to all IRQ types.
>>
>>> +                vector->need_switch = true;
>>> +            } else {
>>> +                vfio_add_kvm_msi_virq(vdev, vector, nr, true);
>>> +            }
>>>          }
>>>      }
>>>  
>>> @@ -566,6 +596,11 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
>>>      }
>>>  }
>>>  
>>> +static inline void vfio_msix_defer_set_virq(VFIOPCIDevice *vdev, bool defer)
>>> +{
>>> +    vdev->defer_set_virq = defer;
>>> +}
>>
>> A helper function seems excessive.
>>
>>> +
>>>  static void vfio_msix_enable(VFIOPCIDevice *vdev)
>>>  {
>>>      PCIDevice *pdev = &vdev->pdev;
>>> @@ -2466,7 +2501,9 @@ static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
>>>      if (msi_enabled(pdev)) {
>>>          vfio_msi_enable(vdev);
>>>      } else if (msix_enabled(pdev)) {
>>> +        vfio_msix_defer_set_virq(vdev, true);
>>>          vfio_msix_enable(vdev);
>>> +        vfio_msix_defer_set_virq(vdev, false);
>>
>> This is the algorithm you want for 65+ vectors, but is this the
>> algorithm we want for 2 vectors?  Who should define that policy?
>> We should at least make lazy KVM routing setup a device option to be
>> able to test it outside of a migration, but should it be enabled by
>> default?  Would anyone mind too much if there was some additional
>> latency of each initial vector triggering?  Thanks,
>>
>> Alex
>>
>>>      }
>>>  
>>>      return ret;
>>> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
>>> index 6477751..846ae85 100644
>>> --- a/hw/vfio/pci.h
>>> +++ b/hw/vfio/pci.h
>>> @@ -95,6 +95,7 @@ typedef struct VFIOMSIVector {
>>>      struct VFIOPCIDevice *vdev; /* back pointer to device */
>>>      int virq;
>>>      bool use;
>>> +    bool need_switch; /* switch to KVM interrupt? */
>>>  } VFIOMSIVector;
>>>  
>>>  enum {
>>> @@ -171,6 +172,7 @@ struct VFIOPCIDevice {
>>>      bool no_kvm_ioeventfd;
>>>      bool no_vfio_ioeventfd;
>>>      bool enable_ramfb;
>>> +    bool defer_set_virq;
>>>      VFIODisplay *dpy;
>>>      Notifier irqchip_change_notifier;
>>>  };
>>
>> .
>>
> 
> .
>