virtio-net: initial iterative live migration support

[RFC 5/6] virtio, virtio-net: skip consistency check in virtio_load for iterative migration

Posted by Jonah Palmer via 3 months, 3 weeks ago

Iterative live migration for virtio-net sends an initial
VMStateDescription while the source is still active. Because data
continues to flow for virtio-net, the guest's avail index continues to
increment after last_avail_idx had already been sent. This causes the
destination to often see something like this from virtio_error():

VQ 0 size 0x100 Guest index 0x0 inconsistent with Host index 0xc: delta 0xfff4

This patch suppresses this consistency check if we're loading the
initial VMStateDescriptions via iterative migration and unsuppresses
it for the stop-and-copy phase when the final VMStateDescriptions
(carrying the correct indices) are loaded.

A temporary VirtIODevMigration migration data structure is introduced here to
represent the iterative migration process for a VirtIODevice. For now it
just holds a flag to indicate whether or not the initial
VMStateDescription was sent during the iterative live migration process.

Signed-off-by: Jonah Palmer <jonah.palmer@oracle.com>
---
 hw/net/virtio-net.c        | 13 +++++++++++++
 hw/virtio/virtio.c         | 32 ++++++++++++++++++++++++--------
 include/hw/virtio/virtio.h |  6 ++++++
 3 files changed, 43 insertions(+), 8 deletions(-)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 86a6fe5b91..b7ac5e8278 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -3843,12 +3843,19 @@ static void virtio_net_save_cleanup(void *opaque)
 
 static int virtio_net_load_setup(QEMUFile *f, void *opaque, Error **errp)
 {
+    VirtIONet *n = opaque;
+    VirtIODevice *vdev = VIRTIO_DEVICE(n);
+    vdev->migration = g_new0(VirtIODevMigration, 1);
+    vdev->migration->iterative_vmstate_loaded = false;
+
     return 0;
 }
 
 static int virtio_net_load_state(QEMUFile *f, void *opaque, int version_id)
 {
     VirtIONet *n = opaque;
+    VirtIODevice *vdev = VIRTIO_DEVICE(n);
+    VirtIODevMigration *mig = vdev->migration;
     uint64_t flag;
 
     flag = qemu_get_be64(f);
@@ -3861,6 +3868,7 @@ static int virtio_net_load_state(QEMUFile *f, void *opaque, int version_id)
         case VNET_MIG_F_INIT_STATE:
         {
             vmstate_load_state(f, &vmstate_virtio_net, n, VIRTIO_NET_VM_VERSION);
+            mig->iterative_vmstate_loaded = true;
             break;
         }
         default:
@@ -3875,6 +3883,11 @@ static int virtio_net_load_state(QEMUFile *f, void *opaque, int version_id)
 
 static int virtio_net_load_cleanup(void *opaque)
 {
+    VirtIONet *n = opaque;
+    VirtIODevice *vdev = VIRTIO_DEVICE(n);
+    g_free(vdev->migration);
+    vdev->migration = NULL;
+
     return 0;
 }
 
diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 5534251e01..68957ee7d1 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -3222,6 +3222,7 @@ virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id)
     int32_t config_len;
     uint32_t num;
     uint32_t features;
+    bool inconsistent_indices;
     BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
     VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
     VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev);
@@ -3365,6 +3366,16 @@ virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id)
         if (vdev->vq[i].vring.desc) {
             uint16_t nheads;
 
+           /*
+            * Ring indices will be inconsistent during iterative migration. The actual
+            * indices will be sent later during the stop-and-copy phase.
+            */
+            if (vdev->migration) {
+                inconsistent_indices = !vdev->migration->iterative_vmstate_loaded;
+            } else {
+                inconsistent_indices = false;
+            }
+
             /*
              * VIRTIO-1 devices migrate desc, used, and avail ring addresses so
              * only the region cache needs to be set up.  Legacy devices need
@@ -3384,14 +3395,19 @@ virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id)
                 continue;
             }
 
-            nheads = vring_avail_idx(&vdev->vq[i]) - vdev->vq[i].last_avail_idx;
-            /* Check it isn't doing strange things with descriptor numbers. */
-            if (nheads > vdev->vq[i].vring.num) {
-                virtio_error(vdev, "VQ %d size 0x%x Guest index 0x%x "
-                             "inconsistent with Host index 0x%x: delta 0x%x",
-                             i, vdev->vq[i].vring.num,
-                             vring_avail_idx(&vdev->vq[i]),
-                             vdev->vq[i].last_avail_idx, nheads);
+            if (!inconsistent_indices) {
+                nheads = vring_avail_idx(&vdev->vq[i]) - vdev->vq[i].last_avail_idx;
+                /* Check it isn't doing strange things with descriptor numbers. */
+                if (nheads > vdev->vq[i].vring.num) {
+                    virtio_error(vdev, "VQ %d size 0x%x Guest index 0x%x "
+                                 "inconsistent with Host index 0x%x: delta 0x%x",
+                                 i, vdev->vq[i].vring.num,
+                                 vring_avail_idx(&vdev->vq[i]),
+                                 vdev->vq[i].last_avail_idx, nheads);
+                    inconsistent_indices = true;
+                }
+            }
+            if (inconsistent_indices) {
                 vdev->vq[i].used_idx = 0;
                 vdev->vq[i].shadow_avail_idx = 0;
                 vdev->vq[i].inuse = 0;
diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index 214d4a77e9..06b6e6ba65 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -98,6 +98,11 @@ enum virtio_device_endian {
     VIRTIO_DEVICE_ENDIAN_BIG,
 };
 
+/* VirtIODevice iterative live migration data structure */
+typedef struct VirtIODevMigration {
+    bool iterative_vmstate_loaded;
+} VirtIODevMigration;
+
 /**
  * struct VirtIODevice - common VirtIO structure
  * @name: name of the device
@@ -151,6 +156,7 @@ struct VirtIODevice
     bool disable_legacy_check;
     bool vhost_started;
     VMChangeStateEntry *vmstate;
+    VirtIODevMigration *migration;
     char *bus_name;
     uint8_t device_endian;
     /**
-- 
2.47.1

Re: [RFC 5/6] virtio,virtio-net: skip consistency check in virtio_load for iterative migration

Posted by Peter Xu 3 months, 1 week ago

On Tue, Jul 22, 2025 at 12:41:26PM +0000, Jonah Palmer wrote:
> Iterative live migration for virtio-net sends an initial
> VMStateDescription while the source is still active. Because data
> continues to flow for virtio-net, the guest's avail index continues to
> increment after last_avail_idx had already been sent. This causes the
> destination to often see something like this from virtio_error():
> 
> VQ 0 size 0x100 Guest index 0x0 inconsistent with Host index 0xc: delta 0xfff4

This is pretty much understandable, as vmstate_save() / vmstate_load() are,
IMHO, not designed to be used while the VM is running.

To me, it's still illegal (per previous patch) to use vmstate_save_state()
while VM is running, in a save_setup() phase.

Some very high level questions from migration POV:

- Have we figured out why the downtime can be shrunk just by sending the
  vmstate twice?

  If we suspect it's because memory got preheated, have we tried other ways
  to simply heat the memory up on the dest side?  For example, some form of
  mlock[all]()?  IMHO it's pretty important we figure out the root cause of
  where such an optimization comes from.

  I do remember we have a downtime issue with the number of max_vqueues that
  may cause post_load() to be slow; I wonder whether there are other ways to
  improve it instead of vmstate_save(), especially in the setup phase.

- Normally devices need iterative phase because:

  (a) the device may contain huge amount of data to transfer

      E.g. RAM and VFIO are good examples and fall into this category.

  (b) the device states are "iterable" by concept

      RAM is definitely true.  VFIO somehow mimicked that even though it was
      a streamed binary protocol..

  What's the answer for virtio-net here?  How large is the device state?
  Is this relevant to vDPA and real hardware (so virtio-net can look
  similar to VFIO at some point)?

Thanks,

-- 
Peter Xu

Re: [RFC 5/6] virtio,virtio-net: skip consistency check in virtio_load for iterative migration

Posted by Jonah Palmer 3 months, 1 week ago

On 8/6/25 12:27 PM, Peter Xu wrote:
> On Tue, Jul 22, 2025 at 12:41:26PM +0000, Jonah Palmer wrote:
>> Iterative live migration for virtio-net sends an initial
>> VMStateDescription while the source is still active. Because data
>> continues to flow for virtio-net, the guest's avail index continues to
>> increment after last_avail_idx had already been sent. This causes the
>> destination to often see something like this from virtio_error():
>>
>> VQ 0 size 0x100 Guest index 0x0 inconsistent with Host index 0xc: delta 0xfff4
> 
> This is pretty much understanable, as vmstate_save() / vmstate_load() are,
> IMHO, not designed to be used while VM is running.
> 
> To me, it's still illegal (per previous patch) to use vmstate_save_state()
> while VM is running, in a save_setup() phase.

Yea I understand where you're coming from. It just seemed too good to 
pass up on as a way to send and receive the entire state of a device.

I felt that if I were to implement something similar for iterative 
migration only that I'd, more or less, be duplicating a lot of already 
existing code or vmstate logic.

> 
> Some very high level questions from migration POV:
> 
> - Have we figured out why the downtime can be shrinked just by sending the
>    vmstate twice?
> 
>    If we suspect it's memory got preheated, have we tried other ways to
>    simply heat the memory up on dest side?  For example, some form of
>    mlock[all]()?  IMHO it's pretty important we figure out the root of why
>    such optimization came from.
> 
>    I do remember we have downtime issue with number of max_vqueues that may
>    cause post_load() to be slow, I wonder there're other ways to improve it
>    instead of vmstate_save(), especially in setup phase.
> 

Yea I believe that the downtime shrinks on the second vmstate_load_state 
due to preheated memory. But I'd like to stress that it's not my 
intention to resend the entire vmstate again during the stop-and-copy 
phase if iterative migration was used. A future iteration of this series 
will eventually include a more efficient approach to update the 
destination with any deltas since the vmstate was sent during the 
iterative portion (instead of just resending the entire vmstate again).

And yea there is an inefficiency regarding walking through 
VIRTIO_QUEUE_MAX (1024) VQs (twice with PCI) that I mentioned here in 
another comment: 
https://lore.kernel.org/qemu-devel/0f5b804d-3852-4159-b151-308a57f1ec74@oracle.com/

This might be better handled in a separate series though rather than as 
part of this one.

> - Normally devices need iterative phase because:
> 
>    (a) the device may contain huge amount of data to transfer
> 
>        E.g. RAM and VFIO are good examples and fall into this category.
> 
>    (b) the device states are "iterable" from concept
> 
>        RAM is definitely true.  VFIO somehow mimiced that even though it was
>        a streamed binary protocol..
> 
>    What's the answer for virtio-net here?  How large is the device state?
>    Is this relevant to vDPA and real hardware (so virtio-net can look
>    similar to VFIO at some point)?
> 

The main motivation behind implementing iterative migration for 
virtio-net is really to improve the guest visible downtime seen when 
migrating a vDPA device.

That is, by implementing iterative migration for virtio-net, we can see 
the state of the device early on and get a head start on work that's 
currently being done during the stop-and-copy phase. If we do this work 
before the stop-and-copy phase, we can further decrease the time spent 
in this window.

This would include work such as sending down the CVQ commands for 
queue-pair creation (even more beneficial for multiqueue), RSS, filters, 
etc.

I'm hoping to show this more explicitly in the next version of this RFC 
series that I'm working on now.

> Thanks,
>

Re: [RFC 5/6] virtio,virtio-net: skip consistency check in virtio_load for iterative migration

Posted by Peter Xu 3 months, 1 week ago

On Thu, Aug 07, 2025 at 10:18:38AM -0400, Jonah Palmer wrote:
> 
> 
> On 8/6/25 12:27 PM, Peter Xu wrote:
> > On Tue, Jul 22, 2025 at 12:41:26PM +0000, Jonah Palmer wrote:
> > > Iterative live migration for virtio-net sends an initial
> > > VMStateDescription while the source is still active. Because data
> > > continues to flow for virtio-net, the guest's avail index continues to
> > > increment after last_avail_idx had already been sent. This causes the
> > > destination to often see something like this from virtio_error():
> > > 
> > > VQ 0 size 0x100 Guest index 0x0 inconsistent with Host index 0xc: delta 0xfff4
> > 
> > This is pretty much understanable, as vmstate_save() / vmstate_load() are,
> > IMHO, not designed to be used while VM is running.
> > 
> > To me, it's still illegal (per previous patch) to use vmstate_save_state()
> > while VM is running, in a save_setup() phase.
> 
> Yea I understand where you're coming from. It just seemed too good to pass
> up on as a way to send and receive the entire state of a device.
> 
> I felt that if I were to implement something similar for iterative migration
> only that I'd, more or less, be duplicating a lot of already existing code
> or vmstate logic.
> 
> > 
> > Some very high level questions from migration POV:
> > 
> > - Have we figured out why the downtime can be shrinked just by sending the
> >    vmstate twice?
> > 
> >    If we suspect it's memory got preheated, have we tried other ways to
> >    simply heat the memory up on dest side?  For example, some form of
> >    mlock[all]()?  IMHO it's pretty important we figure out the root of why
> >    such optimization came from.
> > 
> >    I do remember we have downtime issue with number of max_vqueues that may
> >    cause post_load() to be slow, I wonder there're other ways to improve it
> >    instead of vmstate_save(), especially in setup phase.
> > 
> 
> Yea I believe that the downtime shrinks on the second vmstate_load_state due
> to preheated memory. But I'd like to stress that it's not my intention to
> resend the entire vmstate again during the stop-and-copy phase if iterative
> migration was used. A future iteration of this series will eventually
> include a more efficient approach to update the destination with any deltas
> since the vmstate was sent during the iterative portion (instead of just
> resending the entire vmstate again).
> 
> And yea there is an inefficiency regarding walking through VIRTIO_QUEUE_MAX
> (1024) VQs (twice with PCI) that I mentioned here in another comment: https://lore.kernel.org/qemu-devel/0f5b804d-3852-4159-b151-308a57f1ec74@oracle.com/
> 
> This might be better handled in a separate series though rather than as part
> of this one.

One thing to mention is I recall some other developer was trying to
optimize device load from memory side:

https://lore.kernel.org/all/20230317081904.24389-1-xuchuangxclwt@bytedance.com/

So maybe there're more than one way of doing this, and I'm not sure which
way is better, or both.

> 
> > - Normally devices need iterative phase because:
> > 
> >    (a) the device may contain huge amount of data to transfer
> > 
> >        E.g. RAM and VFIO are good examples and fall into this category.
> > 
> >    (b) the device states are "iterable" from concept
> > 
> >        RAM is definitely true.  VFIO somehow mimiced that even though it was
> >        a streamed binary protocol..
> > 
> >    What's the answer for virtio-net here?  How large is the device state?
> >    Is this relevant to vDPA and real hardware (so virtio-net can look
> >    similar to VFIO at some point)?
> 
> 
> The main motivation behind implementing iterative migration for virtio-net
> is really to improve the guest visible downtime seen when migrating a vDPA
> device.
> 
> That is, by implementing iterative migration for virtio-net, we can see the
> state of the device early on and get a head start on work that's currently
> being done during the stop-and-copy phase. If we do this work before the
> stop-and-copy phase, we can further decrease the time spent in this window.
> 
> This would include work such as sending down the CVQ commands for queue-pair
> creation (even more beneficial for multiqueue), RSS, filters, etc.
> 
> I'm hoping to show this more explicitly in the next version of this RFC
> series that I'm working on now.

OK, thanks for the context. I can wait and read the new version.

In all cases, please note that since the migration thread does not take the
BQL, either the setup or the iterable phase may happen concurrently
with any of the vCPU threads.  I think it means maybe it's not wise to try
to iterate everything: please be ready to see e.g. a 64-bit MMIO register
being partially updated when dumping it to the wire.

Do you have a rough estimation of the size of the device states to migrate?

Thanks,

-- 
Peter Xu

Re: [RFC 5/6] virtio,virtio-net: skip consistency check in virtio_load for iterative migration

Posted by Jonah Palmer 3 months ago


On 8/7/25 12:31 PM, Peter Xu wrote:
> On Thu, Aug 07, 2025 at 10:18:38AM -0400, Jonah Palmer wrote:
>>
>>
>> On 8/6/25 12:27 PM, Peter Xu wrote:
>>> On Tue, Jul 22, 2025 at 12:41:26PM +0000, Jonah Palmer wrote:
>>>> Iterative live migration for virtio-net sends an initial
>>>> VMStateDescription while the source is still active. Because data
>>>> continues to flow for virtio-net, the guest's avail index continues to
>>>> increment after last_avail_idx had already been sent. This causes the
>>>> destination to often see something like this from virtio_error():
>>>>
>>>> VQ 0 size 0x100 Guest index 0x0 inconsistent with Host index 0xc: delta 0xfff4
>>>
>>> This is pretty much understanable, as vmstate_save() / vmstate_load() are,
>>> IMHO, not designed to be used while VM is running.
>>>
>>> To me, it's still illegal (per previous patch) to use vmstate_save_state()
>>> while VM is running, in a save_setup() phase.
>>
>> Yea I understand where you're coming from. It just seemed too good to pass
>> up on as a way to send and receive the entire state of a device.
>>
>> I felt that if I were to implement something similar for iterative migration
>> only that I'd, more or less, be duplicating a lot of already existing code
>> or vmstate logic.
>>
>>>
>>> Some very high level questions from migration POV:
>>>
>>> - Have we figured out why the downtime can be shrinked just by sending the
>>>     vmstate twice?
>>>
>>>     If we suspect it's memory got preheated, have we tried other ways to
>>>     simply heat the memory up on dest side?  For example, some form of
>>>     mlock[all]()?  IMHO it's pretty important we figure out the root of why
>>>     such optimization came from.
>>>
>>>     I do remember we have downtime issue with number of max_vqueues that may
>>>     cause post_load() to be slow, I wonder there're other ways to improve it
>>>     instead of vmstate_save(), especially in setup phase.
>>>
>>
>> Yea I believe that the downtime shrinks on the second vmstate_load_state due
>> to preheated memory. But I'd like to stress that it's not my intention to
>> resend the entire vmstate again during the stop-and-copy phase if iterative
>> migration was used. A future iteration of this series will eventually
>> include a more efficient approach to update the destination with any deltas
>> since the vmstate was sent during the iterative portion (instead of just
>> resending the entire vmstate again).
>>
>> And yea there is an inefficiency regarding walking through VIRTIO_QUEUE_MAX
>> (1024) VQs (twice with PCI) that I mentioned here in another comment: https://lore.kernel.org/qemu-devel/0f5b804d-3852-4159-b151-308a57f1ec74@oracle.com/
>>
>> This might be better handled in a separate series though rather than as part
>> of this one.
> 
> One thing to mention is I recall some other developer was trying to
> optimize device load from memory side:
> 
> https://lore.kernel.org/all/20230317081904.24389-1-xuchuangxclwt@bytedance.com/
> 
> So maybe there're more than one way of doing this, and I'm not sure which
> way is better, or both.
> 

Ack. I'll take a look at this.

>>
>>> - Normally devices need iterative phase because:
>>>
>>>     (a) the device may contain huge amount of data to transfer
>>>
>>>         E.g. RAM and VFIO are good examples and fall into this category.
>>>
>>>     (b) the device states are "iterable" from concept
>>>
>>>         RAM is definitely true.  VFIO somehow mimiced that even though it was
>>>         a streamed binary protocol..
>>>
>>>     What's the answer for virtio-net here?  How large is the device state?
>>>     Is this relevant to vDPA and real hardware (so virtio-net can look
>>>     similar to VFIO at some point)?
>>
>>
>> The main motivation behind implementing iterative migration for virtio-net
>> is really to improve the guest visible downtime seen when migrating a vDPA
>> device.
>>
>> That is, by implementing iterative migration for virtio-net, we can see the
>> state of the device early on and get a head start on work that's currently
>> being done during the stop-and-copy phase. If we do this work before the
>> stop-and-copy phase, we can further decrease the time spent in this window.
>>
>> This would include work such as sending down the CVQ commands for queue-pair
>> creation (even more beneficial for multiqueue), RSS, filters, etc.
>>
>> I'm hoping to show this more explicitly in the next version of this RFC
>> series that I'm working on now.
> 
> OK, thanks for the context. I can wait and read the new version.
> 
> In all cases, please be noted that since migration thread does not take
> BQL, it means either the setup or iterable phase may happen concurrently
> with any of the vCPU threads.  I think it means maybe it's not wise to try
> to iterate everything: please be ready to see e.g. 64bits MMIO register
> being partially updated when dumping it to the wire, for example.
> 

Gotcha. Some of the iterative hooks like .save_setup,
.load_state, etc. do hold the BQL though, right?

> Do you have a rough estimation of the size of the device states to migrate?
> 

Do you have a suggestion for how I might be able to estimate this? I've been
trying to get some kind of rough estimate but have failed to do so.

> Thanks,
>

Re: [RFC 5/6] virtio,virtio-net: skip consistency check in virtio_load for iterative migration

Posted by Peter Xu 3 months ago

On Mon, Aug 11, 2025 at 08:30:19AM -0400, Jonah Palmer wrote:
> 
> 
> On 8/7/25 12:31 PM, Peter Xu wrote:
> > On Thu, Aug 07, 2025 at 10:18:38AM -0400, Jonah Palmer wrote:
> > > 
> > > 
> > > On 8/6/25 12:27 PM, Peter Xu wrote:
> > > > On Tue, Jul 22, 2025 at 12:41:26PM +0000, Jonah Palmer wrote:
> > > > > Iterative live migration for virtio-net sends an initial
> > > > > VMStateDescription while the source is still active. Because data
> > > > > continues to flow for virtio-net, the guest's avail index continues to
> > > > > increment after last_avail_idx had already been sent. This causes the
> > > > > destination to often see something like this from virtio_error():
> > > > > 
> > > > > VQ 0 size 0x100 Guest index 0x0 inconsistent with Host index 0xc: delta 0xfff4
> > > > 
> > > > This is pretty much understanable, as vmstate_save() / vmstate_load() are,
> > > > IMHO, not designed to be used while VM is running.
> > > > 
> > > > To me, it's still illegal (per previous patch) to use vmstate_save_state()
> > > > while VM is running, in a save_setup() phase.
> > > 
> > > Yea I understand where you're coming from. It just seemed too good to pass
> > > up on as a way to send and receive the entire state of a device.
> > > 
> > > I felt that if I were to implement something similar for iterative migration
> > > only that I'd, more or less, be duplicating a lot of already existing code
> > > or vmstate logic.
> > > 
> > > > 
> > > > Some very high level questions from migration POV:
> > > > 
> > > > - Have we figured out why the downtime can be shrinked just by sending the
> > > >     vmstate twice?
> > > > 
> > > >     If we suspect it's memory got preheated, have we tried other ways to
> > > >     simply heat the memory up on dest side?  For example, some form of
> > > >     mlock[all]()?  IMHO it's pretty important we figure out the root of why
> > > >     such optimization came from.
> > > > 
> > > >     I do remember we have downtime issue with number of max_vqueues that may
> > > >     cause post_load() to be slow, I wonder there're other ways to improve it
> > > >     instead of vmstate_save(), especially in setup phase.
> > > > 
> > > 
> > > Yea I believe that the downtime shrinks on the second vmstate_load_state due
> > > to preheated memory. But I'd like to stress that it's not my intention to
> > > resend the entire vmstate again during the stop-and-copy phase if iterative
> > > migration was used. A future iteration of this series will eventually
> > > include a more efficient approach to update the destination with any deltas
> > > since the vmstate was sent during the iterative portion (instead of just
> > > resending the entire vmstate again).
> > > 
> > > And yea there is an inefficiency regarding walking through VIRTIO_QUEUE_MAX
> > > (1024) VQs (twice with PCI) that I mentioned here in another comment: https://lore.kernel.org/qemu-devel/0f5b804d-3852-4159-b151-308a57f1ec74@oracle.com/
> > > 
> > > This might be better handled in a separate series though rather than as part
> > > of this one.
> > 
> > One thing to mention is I recall some other developer was trying to
> > optimize device load from memory side:
> > 
> > > https://lore.kernel.org/all/20230317081904.24389-1-xuchuangxclwt@bytedance.com/
> > 
> > So maybe there're more than one way of doing this, and I'm not sure which
> > way is better, or both.
> > 
> 
> Ack. I'll take a look at this.
> 
> > > 
> > > > - Normally devices need iterative phase because:
> > > > 
> > > >     (a) the device may contain huge amount of data to transfer
> > > > 
> > > >         E.g. RAM and VFIO are good examples and fall into this category.
> > > > 
> > > >     (b) the device states are "iterable" from concept
> > > > 
> > > >         RAM is definitely true.  VFIO somehow mimiced that even though it was
> > > >         a streamed binary protocol..
> > > > 
> > > >     What's the answer for virtio-net here?  How large is the device state?
> > > >     Is this relevant to vDPA and real hardware (so virtio-net can look
> > > >     similar to VFIO at some point)?
> > > 
> > > 
> > > The main motivation behind implementing iterative migration for virtio-net
> > > is really to improve the guest visible downtime seen when migrating a vDPA
> > > device.
> > > 
> > > That is, by implementing iterative migration for virtio-net, we can see the
> > > state of the device early on and get a head start on work that's currently
> > > being done during the stop-and-copy phase. If we do this work before the
> > > stop-and-copy phase, we can further decrease the time spent in this window.
> > > 
> > > This would include work such as sending down the CVQ commands for queue-pair
> > > creation (even more beneficial for multiqueue), RSS, filters, etc.
> > > 
> > > I'm hoping to show this more explicitly in the next version of this RFC
> > > series that I'm working on now.
> > 
> > OK, thanks for the context. I can wait and read the new version.
> > 
> > In all cases, please be noted that since migration thread does not take
> > BQL, it means either the setup or iterable phase may happen concurrently
> > with any of the vCPU threads.  I think it means maybe it's not wise to try
> > to iterate everything: please be ready to see e.g. 64bits MMIO register
> > being partially updated when dumping it to the wire, for example.
> > 
> 
> Gotcha. Some of the iterative hooks though like .save_setup, .load_state,
> etc. do hold the BQL though, right?

load_state() definitely needs the lock.

save_setup(), yes we have bql, but I really wish we don't depend on it, and
I don't know whether it'll keep holding true - AFAIU, the majority of it
really doesn't need the lock..  and I always wanted to see whether I can
remove it.

Normal iterations definitely run without the lock.

> 
> > Do you have a rough estimation of the size of the device states to migrate?
> > 
> 
> Do you have a method at how I might be able to estimate this? I've been
> trying to get some kind of rough estimation but failing to do so.

Could I ask why you started this "migrate virtio-net in iteration phase"
effort?

I thought it was because there's a lot of data to migrate, and there
should be a way to estimate the minimum.  So is it not the case?

How about vDPA devices?  Do those devices have a lot of data to migrate?

We really need a good enough reason to have a device provide
save_iterate().  If it's only about "preheat some MMIO registers", we
should, IMHO, look at more generic ways first.

Thanks,

-- 
Peter Xu

Re: [RFC 5/6] virtio,virtio-net: skip consistency check in virtio_load for iterative migration

Posted by Jonah Palmer 3 months ago


On 8/11/25 9:39 AM, Peter Xu wrote:
> On Mon, Aug 11, 2025 at 08:30:19AM -0400, Jonah Palmer wrote:
>>
>>
>> On 8/7/25 12:31 PM, Peter Xu wrote:
>>> On Thu, Aug 07, 2025 at 10:18:38AM -0400, Jonah Palmer wrote:
>>>>
>>>>
>>>> On 8/6/25 12:27 PM, Peter Xu wrote:
>>>>> On Tue, Jul 22, 2025 at 12:41:26PM +0000, Jonah Palmer wrote:
>>>>>> Iterative live migration for virtio-net sends an initial
>>>>>> VMStateDescription while the source is still active. Because data
>>>>>> continues to flow for virtio-net, the guest's avail index continues to
>>>>>> increment after last_avail_idx had already been sent. This causes the
>>>>>> destination to often see something like this from virtio_error():
>>>>>>
>>>>>> VQ 0 size 0x100 Guest index 0x0 inconsistent with Host index 0xc: delta 0xfff4
>>>>>
>>>>> This is pretty much understanable, as vmstate_save() / vmstate_load() are,
>>>>> IMHO, not designed to be used while VM is running.
>>>>>
>>>>> To me, it's still illegal (per previous patch) to use vmstate_save_state()
>>>>> while VM is running, in a save_setup() phase.
>>>>
>>>> Yea I understand where you're coming from. It just seemed too good to pass
>>>> up on as a way to send and receive the entire state of a device.
>>>>
>>>> I felt that if I were to implement something similar for iterative migration
>>>> only that I'd, more or less, be duplicating a lot of already existing code
>>>> or vmstate logic.
>>>>
>>>>>
>>>>> Some very high level questions from migration POV:
>>>>>
>>>>> - Have we figured out why the downtime can be shrinked just by sending the
>>>>>      vmstate twice?
>>>>>
>>>>>      If we suspect it's memory got preheated, have we tried other ways to
>>>>>      simply heat the memory up on dest side?  For example, some form of
>>>>>      mlock[all]()?  IMHO it's pretty important we figure out the root of why
>>>>>      such optimization came from.
>>>>>
>>>>>      I do remember we have downtime issue with number of max_vqueues that may
>>>>>      cause post_load() to be slow, I wonder there're other ways to improve it
>>>>>      instead of vmstate_save(), especially in setup phase.
>>>>>
>>>>
>>>> Yea I believe that the downtime shrinks on the second vmstate_load_state due
>>>> to preheated memory. But I'd like to stress that it's not my intention to
>>>> resend the entire vmstate again during the stop-and-copy phase if iterative
>>>> migration was used. A future iteration of this series will eventually
>>>> include a more efficient approach to update the destination with any deltas
>>>> since the vmstate was sent during the iterative portion (instead of just
>>>> resending the entire vmstate again).
>>>>
>>>> And yea there is an inefficiency regarding walking through VIRTIO_QUEUE_MAX
>>>> (1024) VQs (twice with PCI) that I mentioned here in another comment: https://urldefense.com/v3/__https://lore.kernel.org/qemu-devel/0f5b804d-3852-4159-b151-308a57f1ec74@oracle.com/__;!!ACWV5N9M2RV99hQ!Oyhh-o4V5gzcWsbmSxAkonhYn3xcLBF50-h-a9-D5MiKgbiHvkaAqdu1VZP5SVmuCk5GQu-sjFhL0IUC$
>>>>
>>>> This might be better handled in a separate series though rather than as part
>>>> of this one.
>>>
>>> One thing to mention is I recall some other developer was trying to
>>> optimize device load from memory side:
>>>
>>> https://urldefense.com/v3/__https://lore.kernel.org/all/20230317081904.24389-1-xuchuangxclwt@bytedance.com/__;!!ACWV5N9M2RV99hQ!Oyhh-o4V5gzcWsbmSxAkonhYn3xcLBF50-h-a9-D5MiKgbiHvkaAqdu1VZP5SVmuCk5GQu-sjBifRrAz$
>>>
>>> So maybe there're more than one way of doing this, and I'm not sure which
>>> way is better, or both.
>>>
>>
>> Ack. I'll take a look at this.
>>
>>>>
>>>>> - Normally devices need iterative phase because:
>>>>>
>>>>>      (a) the device may contain huge amount of data to transfer
>>>>>
>>>>>          E.g. RAM and VFIO are good examples and fall into this category.
>>>>>
>>>>>      (b) the device states are "iterable" from concept
>>>>>
>>>>>          RAM is definitely true.  VFIO somehow mimiced that even though it was
>>>>>          a streamed binary protocol..
>>>>>
>>>>>      What's the answer for virtio-net here?  How large is the device state?
>>>>>      Is this relevant to vDPA and real hardware (so virtio-net can look
>>>>>      similar to VFIO at some point)?
>>>>
>>>>
>>>> The main motivation behind implementing iterative migration for virtio-net
>>>> is really to improve the guest visible downtime seen when migrating a vDPA
>>>> device.
>>>>
>>>> That is, by implementing iterative migration for virtio-net, we can see the
>>>> state of the device early on and get a head start on work that's currently
>>>> being done during the stop-and-copy phase. If we do this work before the
>>>> stop-and-copy phase, we can further decrease the time spent in this window.
>>>>
>>>> This would include work such as sending down the CVQ commands for queue-pair
>>>> creation (even more beneficial for multiqueue), RSS, filters, etc.
>>>>
>>>> I'm hoping to show this more explicitly in the next version of this RFC
>>>> series that I'm working on now.
>>>
>>> OK, thanks for the context. I can wait and read the new version.
>>>
>>> In all cases, please be noted that since migration thread does not take
>>> BQL, it means either the setup or iterable phase may happen concurrently
>>> with any of the vCPU threads.  I think it means maybe it's not wise to try
>>> to iterate everything: please be ready to see e.g. 64bits MMIO register
>>> being partially updated when dumping it to the wire, for example.
>>>
>>
>> Gotcha. Some of the iterative hooks though like .save_setup, .load_state,
>> etc. do hold the BQL though, right?
> 
> load_state() definitely needs the lock.
> 
> save_setup(), yes we have bql, but I really wish we don't depend on it, and
> I don't know whether it'll keep holding true - AFAIU, the majority of it
> really doesn't need the lock..  and I always wanted to see whether I can
> remove it.
> 
> Normal iterations definitely runs without the lock.
> 

Gotcha. Shouldn't be an issue for my implementation (for .save_setup 
anyway).

>>
>>> Do you have a rough estimation of the size of the device states to migrate?
>>>
>>
>> Do you have a method at how I might be able to estimate this? I've been
>> trying to get some kind of rough estimation but failing to do so.
> 
> Could I ask why you started this "migrate virtio-net in iteration phase"
> effort?
> 
> I thought it was because there're a lot of data to migrate, and there
> should be a way to estimate the minumum.  So is it not the case?
> 
> How about vDPA devices?  Do those devices have a lot of data to migrate?
> 
> We really need a good enough reason to have a device provide
> save_iterate().  If it's only about "preheat some MMIO registers", we
> should, IMHO, look at more generic ways first.
> 

This effort was started to reduce the guest visible downtime by 
virtio-net/vhost-net/vhost-vDPA during live migration, especially 
vhost-vDPA.

The downtime contributed by vhost-vDPA, for example, is not from having 
to migrate a lot of state but rather expensive backend control-plane 
latency like CVQ configurations (e.g. MQ queue pairs, RSS, MAC/VLAN 
filters, offload settings, MTU, etc.). Doing this requires kernel/HW NIC
operations, which dominate its downtime.

In other words, by migrating the state of virtio-net early (before the 
stop-and-copy phase), we can also start staging backend configurations, 
which is the main contributor of downtime when migrating a vhost-vDPA 
device.

I apologize if this series gives the impression that we're migrating a 
lot of data here. It's more along the lines of moving control-plane 
latency out of the stop-and-copy phase.

> Thanks,
>

Re: [RFC 5/6] virtio,virtio-net: skip consistency check in virtio_load for iterative migration

Posted by Peter Xu 3 months ago

On Mon, Aug 11, 2025 at 05:26:05PM -0400, Jonah Palmer wrote:
> This effort was started to reduce the guest visible downtime by
> virtio-net/vhost-net/vhost-vDPA during live migration, especially
> vhost-vDPA.
> 
> The downtime contributed by vhost-vDPA, for example, is not from having to
> migrate a lot of state but rather expensive backend control-plane latency
> like CVQ configurations (e.g. MQ queue pairs, RSS, MAC/VLAN filters, offload
> settings, MTU, etc.). Doing this requires kernel/HW NIC operations which
> dominates its downtime.
> 
> In other words, by migrating the state of virtio-net early (before the
> stop-and-copy phase), we can also start staging backend configurations,
> which is the main contributor of downtime when migrating a vhost-vDPA
> device.
> 
> I apologize if this series gives the impression that we're migrating a lot
> of data here. It's more along the lines of moving control-plane latency out
> of the stop-and-copy phase.

I see, thanks.

Please add these into the cover letter of the next post.  IMHO it's
extremely important information to explain the real goal of this work.  I
bet it is not expected for most people when reading the current cover
letter.

Then it could have nothing to do with iterative phase, am I right?

What are the data needed for the dest QEMU to start staging backend
configurations to the HWs underneath?  Does dest QEMU already have them in
the cmdlines?

Asking this because I want to know whether it can be done completely
without src QEMU at all, e.g. when dest QEMU starts.

If src QEMU's data is still needed, please also first consider providing
such facility using an "early VMSD" if it is ever possible: feel free to
refer to commit 3b95a71b22827d26178.

So the data to be transferred is still in VMSD form, i.e. the data are still
described by VMSD macros, instead of hard-coded streaming protocols written
with e.g. QEMUFile APIs in save_setup()/load_setup().

When things are described in VMSDs, they get the most benefit from the live
migration framework, and it's much, much more flexible.  It's the most
suggested way for a device to cooperate with live migration; savevmhandlers
are only the last resort because they're almost outside of migration's control.

In short, please avoid using savevmhandlers as long as there can be any
other way to achieve similar results.

Thanks,

-- 
Peter Xu

Re: [RFC 5/6] virtio,virtio-net: skip consistency check in virtio_load for iterative migration

Posted by Eugenio Perez Martin 3 months ago

On Mon, Aug 11, 2025 at 11:56 PM Peter Xu <peterx@redhat.com> wrote:
>
> On Mon, Aug 11, 2025 at 05:26:05PM -0400, Jonah Palmer wrote:
> > This effort was started to reduce the guest visible downtime by
> > virtio-net/vhost-net/vhost-vDPA during live migration, especially
> > vhost-vDPA.
> >
> > The downtime contributed by vhost-vDPA, for example, is not from having to
> > migrate a lot of state but rather expensive backend control-plane latency
> > like CVQ configurations (e.g. MQ queue pairs, RSS, MAC/VLAN filters, offload
> > settings, MTU, etc.). Doing this requires kernel/HW NIC operations which
> > dominates its downtime.
> >
> > In other words, by migrating the state of virtio-net early (before the
> > stop-and-copy phase), we can also start staging backend configurations,
> > which is the main contributor of downtime when migrating a vhost-vDPA
> > device.
> >
> > I apologize if this series gives the impression that we're migrating a lot
> > of data here. It's more along the lines of moving control-plane latency out
> > of the stop-and-copy phase.
>
> I see, thanks.
>
> Please add these into the cover letter of the next post.  IMHO it's
> extremely important information to explain the real goal of this work.  I
> bet it is not expected for most people when reading the current cover
> letter.
>
> Then it could have nothing to do with iterative phase, am I right?
>
> What are the data needed for the dest QEMU to start staging backend
> configurations to the HWs underneath?  Does dest QEMU already have them in
> the cmdlines?
>
> Asking this because I want to know whether it can be done completely
> without src QEMU at all, e.g. when dest QEMU starts.
>
> If src QEMU's data is still needed, please also first consider providing
> such facility using an "early VMSD" if it is ever possible: feel free to
> refer to commit 3b95a71b22827d26178.
>

While it works for this series, it does not allow resending the state
when the src device changes, for example if the number of virtqueues
is modified.

> So the data to be transferred is still in VMSD form, aka, data are still
> described by VMSD macros, instead of hard-coded streamline protocols using
> e.g. qemufile APIs using save_setup()/load_setup().
>
> When things are described in VMSDs, it get the most benefit from the live
> migration framework, and it's much, much more flexible.  It's the most
> suggested way for device to cooperate with live migration, savevmhandlers
> are only the last resort because it's almost not in control of migration..
>
> In short, please avoid using savevmhandlers as long as there can be any
> other way to achieve similar results.
>
> Thanks,
>
> --
> Peter Xu
>

Re: [RFC 5/6] virtio,virtio-net: skip consistency check in virtio_load for iterative migration

Posted by Peter Xu 3 months ago

On Wed, Aug 13, 2025 at 11:25:00AM +0200, Eugenio Perez Martin wrote:
> On Mon, Aug 11, 2025 at 11:56 PM Peter Xu <peterx@redhat.com> wrote:
> >
> > On Mon, Aug 11, 2025 at 05:26:05PM -0400, Jonah Palmer wrote:
> > > This effort was started to reduce the guest visible downtime by
> > > virtio-net/vhost-net/vhost-vDPA during live migration, especially
> > > vhost-vDPA.
> > >
> > > The downtime contributed by vhost-vDPA, for example, is not from having to
> > > migrate a lot of state but rather expensive backend control-plane latency
> > > like CVQ configurations (e.g. MQ queue pairs, RSS, MAC/VLAN filters, offload
> > > settings, MTU, etc.). Doing this requires kernel/HW NIC operations which
> > > dominates its downtime.
> > >
> > > In other words, by migrating the state of virtio-net early (before the
> > > stop-and-copy phase), we can also start staging backend configurations,
> > > which is the main contributor of downtime when migrating a vhost-vDPA
> > > device.
> > >
> > > I apologize if this series gives the impression that we're migrating a lot
> > > of data here. It's more along the lines of moving control-plane latency out
> > > of the stop-and-copy phase.
> >
> > I see, thanks.
> >
> > Please add these into the cover letter of the next post.  IMHO it's
> > extremely important information to explain the real goal of this work.  I
> > bet it is not expected for most people when reading the current cover
> > letter.
> >
> > Then it could have nothing to do with iterative phase, am I right?
> >
> > What are the data needed for the dest QEMU to start staging backend
> > configurations to the HWs underneath?  Does dest QEMU already have them in
> > the cmdlines?
> >
> > Asking this because I want to know whether it can be done completely
> > without src QEMU at all, e.g. when dest QEMU starts.
> >
> > If src QEMU's data is still needed, please also first consider providing
> > such facility using an "early VMSD" if it is ever possible: feel free to
> > refer to commit 3b95a71b22827d26178.
> >
> 
> While it works for this series, it does not allow to resend the state
> when the src device changes. For example, if the number of virtqueues
> is modified.

Some explanation on "how sync number of vqueues helps downtime" would help.
Not "it might preheat things", but exactly why, and how that differs when
it's pure software, and when hardware will be involved.

If it's only about pre-heat, could dest qemu preheat with max num of
vqueues?  Is it the same cost of downtime when growing num of queues,
v.s. shrinking num of queues?

For software, is it about memory transaction updates due to the vqueues?
If so, have we investigated a more generic approach on the memory side, likely
some form of continuation of Chuang's work I previously mentioned?

-- 
Peter Xu

Re: [RFC 5/6] virtio,virtio-net: skip consistency check in virtio_load for iterative migration

Posted by Eugenio Perez Martin 3 months ago

On Wed, Aug 13, 2025 at 4:06 PM Peter Xu <peterx@redhat.com> wrote:
>
> On Wed, Aug 13, 2025 at 11:25:00AM +0200, Eugenio Perez Martin wrote:
> > On Mon, Aug 11, 2025 at 11:56 PM Peter Xu <peterx@redhat.com> wrote:
> > >
> > > On Mon, Aug 11, 2025 at 05:26:05PM -0400, Jonah Palmer wrote:
> > > > This effort was started to reduce the guest visible downtime by
> > > > virtio-net/vhost-net/vhost-vDPA during live migration, especially
> > > > vhost-vDPA.
> > > >
> > > > The downtime contributed by vhost-vDPA, for example, is not from having to
> > > > migrate a lot of state but rather expensive backend control-plane latency
> > > > like CVQ configurations (e.g. MQ queue pairs, RSS, MAC/VLAN filters, offload
> > > > settings, MTU, etc.). Doing this requires kernel/HW NIC operations which
> > > > dominates its downtime.
> > > >
> > > > In other words, by migrating the state of virtio-net early (before the
> > > > stop-and-copy phase), we can also start staging backend configurations,
> > > > which is the main contributor of downtime when migrating a vhost-vDPA
> > > > device.
> > > >
> > > > I apologize if this series gives the impression that we're migrating a lot
> > > > of data here. It's more along the lines of moving control-plane latency out
> > > > of the stop-and-copy phase.
> > >
> > > I see, thanks.
> > >
> > > Please add these into the cover letter of the next post.  IMHO it's
> > > extremely important information to explain the real goal of this work.  I
> > > bet it is not expected for most people when reading the current cover
> > > letter.
> > >
> > > Then it could have nothing to do with iterative phase, am I right?
> > >
> > > What are the data needed for the dest QEMU to start staging backend
> > > configurations to the HWs underneath?  Does dest QEMU already have them in
> > > the cmdlines?
> > >
> > > Asking this because I want to know whether it can be done completely
> > > without src QEMU at all, e.g. when dest QEMU starts.
> > >
> > > If src QEMU's data is still needed, please also first consider providing
> > > such facility using an "early VMSD" if it is ever possible: feel free to
> > > refer to commit 3b95a71b22827d26178.
> > >
> >
> > While it works for this series, it does not allow to resend the state
> > when the src device changes. For example, if the number of virtqueues
> > is modified.
>
> Some explanation on "how sync number of vqueues helps downtime" would help.
> Not "it might preheat things", but exactly why, and how that differs when
> it's pure software, and when hardware will be involved.
>

According to NVIDIA engineers, configuring vqs (number, size, RSS, etc.)
takes ~200ms:
https://lore.kernel.org/qemu-devel/6c8ebb97-d546-3f1c-4cdd-54e23a566f61@nvidia.com/T/

Adding Dragos here in case he can provide more details. Maybe the
numbers have changed though.

And I guess the difference with pure SW will always come down to PCI
communications, which I assume are slower than configuring the host SW
device in RAM or even CPU cache. But I admit that proper profiling is
needed before making those claims.

Jonah, can you print the time it takes to configure the vDPA device
with traces vs the time it takes to enable the dataplane of the
device? So we can get an idea of how much time we save with this.

> If it's only about pre-heat, could dest qemu preheat with max num of
> vqueues?  Is it the same cost of downtime when growing num of queues,
> v.s. shrinking num of queues?
>

Well you need to send the vq addresses and properties to preheat
these. If the address is invalid, the destination device will
interpret the vq address as the avail ring, for example, and will read
an invalid avail idx.

> For softwares, is it about memory transaction updates due to the vqueues?
> If so, have we investigated a more generic approach on memory side, likely
> some form of continuation from Chuang's work I previously mentioned?
>

This work is very interesting, and most of the downtime was because of
memory pinning indeed. Thanks for bringing it up! But the downtime is
not caused by the individual vq memory config, but by pinning all
the guest's memory for the device to access it.

I think it is worth exploring whether it affects the downtime in the case
of HW. I don't see any reason to reject that series other than a lack of
reviews, do you?

Re: [RFC 5/6] virtio,virtio-net: skip consistency check in virtio_load for iterative migration

Posted by Jonah Palmer 3 months ago


On 8/14/25 5:28 AM, Eugenio Perez Martin wrote:
> On Wed, Aug 13, 2025 at 4:06 PM Peter Xu <peterx@redhat.com> wrote:
>>
>> On Wed, Aug 13, 2025 at 11:25:00AM +0200, Eugenio Perez Martin wrote:
>>> On Mon, Aug 11, 2025 at 11:56 PM Peter Xu <peterx@redhat.com> wrote:
>>>>
>>>> On Mon, Aug 11, 2025 at 05:26:05PM -0400, Jonah Palmer wrote:
>>>>> This effort was started to reduce the guest visible downtime by
>>>>> virtio-net/vhost-net/vhost-vDPA during live migration, especially
>>>>> vhost-vDPA.
>>>>>
>>>>> The downtime contributed by vhost-vDPA, for example, is not from having to
>>>>> migrate a lot of state but rather expensive backend control-plane latency
>>>>> like CVQ configurations (e.g. MQ queue pairs, RSS, MAC/VLAN filters, offload
>>>>> settings, MTU, etc.). Doing this requires kernel/HW NIC operations which
>>>>> dominates its downtime.
>>>>>
>>>>> In other words, by migrating the state of virtio-net early (before the
>>>>> stop-and-copy phase), we can also start staging backend configurations,
>>>>> which is the main contributor of downtime when migrating a vhost-vDPA
>>>>> device.
>>>>>
>>>>> I apologize if this series gives the impression that we're migrating a lot
>>>>> of data here. It's more along the lines of moving control-plane latency out
>>>>> of the stop-and-copy phase.
>>>>
>>>> I see, thanks.
>>>>
>>>> Please add these into the cover letter of the next post.  IMHO it's
>>>> extremely important information to explain the real goal of this work.  I
>>>> bet it is not expected for most people when reading the current cover
>>>> letter.
>>>>
>>>> Then it could have nothing to do with iterative phase, am I right?
>>>>
>>>> What are the data needed for the dest QEMU to start staging backend
>>>> configurations to the HWs underneath?  Does dest QEMU already have them in
>>>> the cmdlines?
>>>>
>>>> Asking this because I want to know whether it can be done completely
>>>> without src QEMU at all, e.g. when dest QEMU starts.
>>>>
>>>> If src QEMU's data is still needed, please also first consider providing
>>>> such facility using an "early VMSD" if it is ever possible: feel free to
>>>> refer to commit 3b95a71b22827d26178.
>>>>
>>>
>>> While it works for this series, it does not allow to resend the state
>>> when the src device changes. For example, if the number of virtqueues
>>> is modified.
>>
>> Some explanation on "how sync number of vqueues helps downtime" would help.
>> Not "it might preheat things", but exactly why, and how that differs when
>> it's pure software, and when hardware will be involved.
>>
> 
> By nvidia engineers to configure vqs (number, size, RSS, etc) takes
> about ~200ms:
> https://urldefense.com/v3/__https://lore.kernel.org/qemu-devel/6c8ebb97-d546-3f1c-4cdd-54e23a566f61@nvidia.com/T/__;!!ACWV5N9M2RV99hQ!OQdf7sGaBlbXhcFHX7AC7HgYxvFljgwWlIgJCvMgWwFvPqMrAMbWqf0862zV5shIjaUvlrk54fLTK6uo2pA$
> 
> Adding Dragos here in case he can provide more details. Maybe the
> numbers have changed though.
> 
> And I guess the difference with pure SW will always come down to PCI
> communications, which assume it is slower than configuring the host SW
> device in RAM or even CPU cache. But I admin that proper profiling is
> needed before making those claims.
> 
> Jonah, can you print the time it takes to configure the vDPA device
> with traces vs the time it takes to enable the dataplane of the
> device? So we can get an idea of how much time we save with this.
> 

Let me know if this isn't what you're looking for.

I'm assuming by "configuration time" you mean:
  - Time from device startup (entry to vhost_vdpa_dev_start()) to right
    before we start enabling the vrings (e.g.
    VHOST_VDPA_SET_VRING_ENABLE in vhost_vdpa_net_cvq_load()).

And by "time taken to enable the dataplane" I'm assuming you mean:
  - Time right before we start enabling the vrings (see above) to right
    after we enable the last vring (at the end of
    vhost_vdpa_net_cvq_load())

Guest specs: 128G Mem, SVQ=on, CVQ=on, 8 queue pairs:

-netdev type=vhost-vdpa,vhostdev=$VHOST_VDPA_0,id=vhost-vdpa0,
         queues=8,x-svq=on

-device virtio-net-pci,netdev=vhost-vdpa0,id=vdpa0,bootindex=-1,
         romfile=,page-per-vq=on,mac=$VF1_MAC,ctrl_vq=on,mq=on,
         ctrl_vlan=off,vectors=18,host_mtu=9000,
         disable-legacy=on,disable-modern=off

---

Configuration time:    ~31s
Dataplane enable time: ~0.14ms

>> If it's only about pre-heat, could dest qemu preheat with max num of
>> vqueues?  Is it the same cost of downtime when growing num of queues,
>> v.s. shrinking num of queues?
>>
> 
> Well you need to send the vq addresses and properties to preheat
> these. If the address is invalid, the destination device will
> interpret the vq address as the avail ring, for example, and will read
> an invalid avail idx.
> 
>> For softwares, is it about memory transaction updates due to the vqueues?
>> If so, have we investigated a more generic approach on memory side, likely
>> some form of continuation from Chuang's work I previously mentioned?
>>
> 
> This work is very interesting, and most of the downtime was because of
> memory pinning indeed. Thanks for bringing it up! But the downtime is
> not caused for the individual vq memory config, but for pinning all
> the guest's memory for the device to access to it.
> 
> I think it is worth exploring if it affects the downtime in the case
> of HW. I don't see any reason to reject that series but lack of
> reviews, isn't it?
>

Re: [RFC 5/6] virtio,virtio-net: skip consistency check in virtio_load for iterative migration

Posted by Eugenio Perez Martin 2 months, 4 weeks ago

On Fri, Aug 15, 2025 at 4:50 PM Jonah Palmer <jonah.palmer@oracle.com> wrote:
>
>
>
> On 8/14/25 5:28 AM, Eugenio Perez Martin wrote:
> > On Wed, Aug 13, 2025 at 4:06 PM Peter Xu <peterx@redhat.com> wrote:
> >>
> >> On Wed, Aug 13, 2025 at 11:25:00AM +0200, Eugenio Perez Martin wrote:
> >>> On Mon, Aug 11, 2025 at 11:56 PM Peter Xu <peterx@redhat.com> wrote:
> >>>>
> >>>> On Mon, Aug 11, 2025 at 05:26:05PM -0400, Jonah Palmer wrote:
> >>>>> This effort was started to reduce the guest visible downtime by
> >>>>> virtio-net/vhost-net/vhost-vDPA during live migration, especially
> >>>>> vhost-vDPA.
> >>>>>
> >>>>> The downtime contributed by vhost-vDPA, for example, is not from having to
> >>>>> migrate a lot of state but rather expensive backend control-plane latency
> >>>>> like CVQ configurations (e.g. MQ queue pairs, RSS, MAC/VLAN filters, offload
> >>>>> settings, MTU, etc.). Doing this requires kernel/HW NIC operations which
> >>>>> dominates its downtime.
> >>>>>
> >>>>> In other words, by migrating the state of virtio-net early (before the
> >>>>> stop-and-copy phase), we can also start staging backend configurations,
> >>>>> which is the main contributor of downtime when migrating a vhost-vDPA
> >>>>> device.
> >>>>>
> >>>>> I apologize if this series gives the impression that we're migrating a lot
> >>>>> of data here. It's more along the lines of moving control-plane latency out
> >>>>> of the stop-and-copy phase.
> >>>>
> >>>> I see, thanks.
> >>>>
> >>>> Please add these into the cover letter of the next post.  IMHO it's
> >>>> extremely important information to explain the real goal of this work.  I
> >>>> bet it is not expected for most people when reading the current cover
> >>>> letter.
> >>>>
> >>>> Then it could have nothing to do with iterative phase, am I right?
> >>>>
> >>>> What are the data needed for the dest QEMU to start staging backend
> >>>> configurations to the HWs underneath?  Does dest QEMU already have them in
> >>>> the cmdlines?
> >>>>
> >>>> Asking this because I want to know whether it can be done completely
> >>>> without src QEMU at all, e.g. when dest QEMU starts.
> >>>>
> >>>> If src QEMU's data is still needed, please also first consider providing
> >>>> such facility using an "early VMSD" if it is ever possible: feel free to
> >>>> refer to commit 3b95a71b22827d26178.
> >>>>
> >>>
> >>> While it works for this series, it does not allow to resend the state
> >>> when the src device changes. For example, if the number of virtqueues
> >>> is modified.
> >>
> >> Some explanation on "how sync number of vqueues helps downtime" would help.
> >> Not "it might preheat things", but exactly why, and how that differs when
> >> it's pure software, and when hardware will be involved.
> >>
> >
> > By nvidia engineers to configure vqs (number, size, RSS, etc) takes
> > about ~200ms:
> > https://urldefense.com/v3/__https://lore.kernel.org/qemu-devel/6c8ebb97-d546-3f1c-4cdd-54e23a566f61@nvidia.com/T/__;!!ACWV5N9M2RV99hQ!OQdf7sGaBlbXhcFHX7AC7HgYxvFljgwWlIgJCvMgWwFvPqMrAMbWqf0862zV5shIjaUvlrk54fLTK6uo2pA$
> >
> > Adding Dragos here in case he can provide more details. Maybe the
> > numbers have changed though.
> >
> > And I guess the difference with pure SW will always come down to PCI
> > communications, which assume it is slower than configuring the host SW
> > device in RAM or even CPU cache. But I admin that proper profiling is
> > needed before making those claims.
> >
> > Jonah, can you print the time it takes to configure the vDPA device
> > with traces vs the time it takes to enable the dataplane of the
> > device? So we can get an idea of how much time we save with this.
> >
>
> Let me know if this isn't what you're looking for.
>
> I'm assuming by "configuration time" you mean:
>   - Time from device startup (entry to vhost_vdpa_dev_start()) to right
>     before we start enabling the vrings (e.g.
>     VHOST_VDPA_SET_VRING_ENABLE in vhost_vdpa_net_cvq_load()).
>
> And by "time taken to enable the dataplane" I'm assuming you mean:
>   - Time right before we start enabling the vrings (see above) to right
>     after we enable the last vring (at the end of
>     vhost_vdpa_net_cvq_load())
>
> Guest specs: 128G Mem, SVQ=on, CVQ=on, 8 queue pairs:
>
> -netdev type=vhost-vdpa,vhostdev=$VHOST_VDPA_0,id=vhost-vdpa0,
>          queues=8,x-svq=on
>
> -device virtio-net-pci,netdev=vhost-vdpa0,id=vdpa0,bootindex=-1,
>          romfile=,page-per-vq=on,mac=$VF1_MAC,ctrl_vq=on,mq=on,
>          ctrl_vlan=off,vectors=18,host_mtu=9000,
>          disable-legacy=on,disable-modern=off
>
> ---
>
> Configuration time:    ~31s
> Dataplane enable time: ~0.14ms
>

I was vague, but yes, that's representative enough! It would be more
accurate if the configuration time ends by the time QEMU enables the
first queue of the dataplane though.

As Si-Wei mentions, is v->shared->listener_registered == true at the
beginning of vhost_vdpa_dev_start?

Re: [RFC 5/6] virtio,virtio-net: skip consistency check in virtio_load for iterative migration

Posted by Jonah Palmer 2 months, 4 weeks ago


On 8/18/25 2:51 AM, Eugenio Perez Martin wrote:
> On Fri, Aug 15, 2025 at 4:50 PM Jonah Palmer <jonah.palmer@oracle.com> wrote:
>>
>>
>>
>> On 8/14/25 5:28 AM, Eugenio Perez Martin wrote:
>>> On Wed, Aug 13, 2025 at 4:06 PM Peter Xu <peterx@redhat.com> wrote:
>>>>
>>>> On Wed, Aug 13, 2025 at 11:25:00AM +0200, Eugenio Perez Martin wrote:
>>>>> On Mon, Aug 11, 2025 at 11:56 PM Peter Xu <peterx@redhat.com> wrote:
>>>>>>
>>>>>> On Mon, Aug 11, 2025 at 05:26:05PM -0400, Jonah Palmer wrote:
>>>>>>> This effort was started to reduce the guest visible downtime by
>>>>>>> virtio-net/vhost-net/vhost-vDPA during live migration, especially
>>>>>>> vhost-vDPA.
>>>>>>>
>>>>>>> The downtime contributed by vhost-vDPA, for example, is not from having to
>>>>>>> migrate a lot of state but rather expensive backend control-plane latency
>>>>>>> like CVQ configurations (e.g. MQ queue pairs, RSS, MAC/VLAN filters, offload
>>>>>>> settings, MTU, etc.). Doing this requires kernel/HW NIC operations which
>>>>>>> dominates its downtime.
>>>>>>>
>>>>>>> In other words, by migrating the state of virtio-net early (before the
>>>>>>> stop-and-copy phase), we can also start staging backend configurations,
>>>>>>> which is the main contributor of downtime when migrating a vhost-vDPA
>>>>>>> device.
>>>>>>>
>>>>>>> I apologize if this series gives the impression that we're migrating a lot
>>>>>>> of data here. It's more along the lines of moving control-plane latency out
>>>>>>> of the stop-and-copy phase.
>>>>>>
>>>>>> I see, thanks.
>>>>>>
>>>>>> Please add these into the cover letter of the next post.  IMHO it's
>>>>>> extremely important information to explain the real goal of this work.  I
>>>>>> bet it is not expected for most people when reading the current cover
>>>>>> letter.
>>>>>>
>>>>>> Then it could have nothing to do with iterative phase, am I right?
>>>>>>
>>>>>> What are the data needed for the dest QEMU to start staging backend
>>>>>> configurations to the HWs underneath?  Does dest QEMU already have them in
>>>>>> the cmdlines?
>>>>>>
>>>>>> Asking this because I want to know whether it can be done completely
>>>>>> without src QEMU at all, e.g. when dest QEMU starts.
>>>>>>
>>>>>> If src QEMU's data is still needed, please also first consider providing
>>>>>> such facility using an "early VMSD" if it is ever possible: feel free to
>>>>>> refer to commit 3b95a71b22827d26178.
>>>>>>
>>>>>
>>>>> While it works for this series, it does not allow to resend the state
>>>>> when the src device changes. For example, if the number of virtqueues
>>>>> is modified.
>>>>
>>>> Some explanation on "how sync number of vqueues helps downtime" would help.
>>>> Not "it might preheat things", but exactly why, and how that differs when
>>>> it's pure software, and when hardware will be involved.
>>>>
>>>
>>> According to NVIDIA engineers, configuring vqs (number, size, RSS, etc)
>>> takes about ~200ms:
>>> https://urldefense.com/v3/__https://lore.kernel.org/qemu-devel/6c8ebb97-d546-3f1c-4cdd-54e23a566f61@nvidia.com/T/__;!!ACWV5N9M2RV99hQ!OQdf7sGaBlbXhcFHX7AC7HgYxvFljgwWlIgJCvMgWwFvPqMrAMbWqf0862zV5shIjaUvlrk54fLTK6uo2pA$
>>>
>>> Adding Dragos here in case he can provide more details. Maybe the
>>> numbers have changed though.
>>>
>>> And I guess the difference with pure SW will always come down to PCI
>>> communications, which I assume are slower than configuring the host SW
>>> device in RAM or even CPU cache. But I admit that proper profiling is
>>> needed before making those claims.
>>>
>>> Jonah, can you print the time it takes to configure the vDPA device
>>> with traces vs the time it takes to enable the dataplane of the
>>> device? So we can get an idea of how much time we save with this.
>>>
>>
>> Let me know if this isn't what you're looking for.
>>
>> I'm assuming by "configuration time" you mean:
>>    - Time from device startup (entry to vhost_vdpa_dev_start()) to right
>>      before we start enabling the vrings (e.g.
>>      VHOST_VDPA_SET_VRING_ENABLE in vhost_vdpa_net_cvq_load()).
>>
>> And by "time taken to enable the dataplane" I'm assuming you mean:
>>    - Time right before we start enabling the vrings (see above) to right
>>      after we enable the last vring (at the end of
>>      vhost_vdpa_net_cvq_load())
>>
>> Guest specs: 128G Mem, SVQ=on, CVQ=on, 8 queue pairs:
>>
>> -netdev type=vhost-vdpa,vhostdev=$VHOST_VDPA_0,id=vhost-vdpa0,
>>           queues=8,x-svq=on
>>
>> -device virtio-net-pci,netdev=vhost-vdpa0,id=vdpa0,bootindex=-1,
>>           romfile=,page-per-vq=on,mac=$VF1_MAC,ctrl_vq=on,mq=on,
>>           ctrl_vlan=off,vectors=18,host_mtu=9000,
>>           disable-legacy=on,disable-modern=off
>>
>> ---
>>
>> Configuration time:    ~31s
>> Dataplane enable time: ~0.14ms
>>
> 
> I was vague, but yes, that's representative enough! It would be more
> accurate if the configuration time ends by the time QEMU enables the
> first queue of the dataplane though.
> 
> As Si-Wei mentions, is v->shared->listener_registered == true at the
> beginning of vhost_vdpa_dev_start?
> 

Ah, I also realized that the Qemu I was using for measurements was a
version from before the listener_registered member was introduced.

I retested with the latest changes in Qemu and set x-svq=off, i.e.:
guest specs: 128G Mem, SVQ=off, CVQ=on, 8 queue pairs. I ran the test 3
times for measurements.

v->shared->listener_registered == false at the beginning of 
vhost_vdpa_dev_start().

---

Configuration time: Time from first entry into vhost_vdpa_dev_start() to 
right after Qemu enables the first VQ.
  - 26.947s, 26.606s, 27.326s

Enable dataplane: Time from right after first VQ is enabled to right 
after the last VQ is enabled.
  - 0.081ms, 0.081ms, 0.079ms

Re: [RFC 5/6] virtio,virtio-net: skip consistency check in virtio_load for iterative migration

Posted by Eugenio Perez Martin 2 months, 4 weeks ago

On Mon, Aug 18, 2025 at 4:46 PM Jonah Palmer <jonah.palmer@oracle.com> wrote:
>
>
>
> On 8/18/25 2:51 AM, Eugenio Perez Martin wrote:
> > On Fri, Aug 15, 2025 at 4:50 PM Jonah Palmer <jonah.palmer@oracle.com> wrote:
> >>
> >>
> >>
> >> On 8/14/25 5:28 AM, Eugenio Perez Martin wrote:
> >>> On Wed, Aug 13, 2025 at 4:06 PM Peter Xu <peterx@redhat.com> wrote:
> >>>>
> >>>> On Wed, Aug 13, 2025 at 11:25:00AM +0200, Eugenio Perez Martin wrote:
> >>>>> On Mon, Aug 11, 2025 at 11:56 PM Peter Xu <peterx@redhat.com> wrote:
> >>>>>>
> >>>>>> On Mon, Aug 11, 2025 at 05:26:05PM -0400, Jonah Palmer wrote:
> >>>>>>> This effort was started to reduce the guest visible downtime by
> >>>>>>> virtio-net/vhost-net/vhost-vDPA during live migration, especially
> >>>>>>> vhost-vDPA.
> >>>>>>>
> >>>>>>> The downtime contributed by vhost-vDPA, for example, is not from having to
> >>>>>>> migrate a lot of state but rather expensive backend control-plane latency
> >>>>>>> like CVQ configurations (e.g. MQ queue pairs, RSS, MAC/VLAN filters, offload
> >>>>>>> settings, MTU, etc.). Doing this requires kernel/HW NIC operations which
> >>>>>>> dominates its downtime.
> >>>>>>>
> >>>>>>> In other words, by migrating the state of virtio-net early (before the
> >>>>>>> stop-and-copy phase), we can also start staging backend configurations,
> >>>>>>> which is the main contributor of downtime when migrating a vhost-vDPA
> >>>>>>> device.
> >>>>>>>
> >>>>>>> I apologize if this series gives the impression that we're migrating a lot
> >>>>>>> of data here. It's more along the lines of moving control-plane latency out
> >>>>>>> of the stop-and-copy phase.
> >>>>>>
> >>>>>> I see, thanks.
> >>>>>>
> >>>>>> Please add these into the cover letter of the next post.  IMHO it's
> >>>>>> extremely important information to explain the real goal of this work.  I
> >>>>>> bet it is not expected for most people when reading the current cover
> >>>>>> letter.
> >>>>>>
> >>>>>> Then it could have nothing to do with iterative phase, am I right?
> >>>>>>
> >>>>>> What are the data needed for the dest QEMU to start staging backend
> >>>>>> configurations to the HWs underneath?  Does dest QEMU already have them in
> >>>>>> the cmdlines?
> >>>>>>
> >>>>>> Asking this because I want to know whether it can be done completely
> >>>>>> without src QEMU at all, e.g. when dest QEMU starts.
> >>>>>>
> >>>>>> If src QEMU's data is still needed, please also first consider providing
> >>>>>> such facility using an "early VMSD" if it is ever possible: feel free to
> >>>>>> refer to commit 3b95a71b22827d26178.
> >>>>>>
> >>>>>
> >>>>> While it works for this series, it does not allow to resend the state
> >>>>> when the src device changes. For example, if the number of virtqueues
> >>>>> is modified.
> >>>>
> >>>> Some explanation on "how sync number of vqueues helps downtime" would help.
> >>>> Not "it might preheat things", but exactly why, and how that differs when
> >>>> it's pure software, and when hardware will be involved.
> >>>>
> >>>
> >>> By nvidia engineers to configure vqs (number, size, RSS, etc) takes
> >>> about ~200ms:
> >>> https://urldefense.com/v3/__https://lore.kernel.org/qemu-devel/6c8ebb97-d546-3f1c-4cdd-54e23a566f61@nvidia.com/T/__;!!ACWV5N9M2RV99hQ!OQdf7sGaBlbXhcFHX7AC7HgYxvFljgwWlIgJCvMgWwFvPqMrAMbWqf0862zV5shIjaUvlrk54fLTK6uo2pA$
> >>>
> >>> Adding Dragos here in case he can provide more details. Maybe the
> >>> numbers have changed though.
> >>>
> >>> And I guess the difference with pure SW will always come down to PCI
> >>> communications, which I assume are slower than configuring the host SW
> >>> device in RAM or even CPU cache. But I admit that proper profiling is
> >>> needed before making those claims.
> >>>
> >>> Jonah, can you print the time it takes to configure the vDPA device
> >>> with traces vs the time it takes to enable the dataplane of the
> >>> device? So we can get an idea of how much time we save with this.
> >>>
> >>
> >> Let me know if this isn't what you're looking for.
> >>
> >> I'm assuming by "configuration time" you mean:
> >>    - Time from device startup (entry to vhost_vdpa_dev_start()) to right
> >>      before we start enabling the vrings (e.g.
> >>      VHOST_VDPA_SET_VRING_ENABLE in vhost_vdpa_net_cvq_load()).
> >>
> >> And by "time taken to enable the dataplane" I'm assuming you mean:
> >>    - Time right before we start enabling the vrings (see above) to right
> >>      after we enable the last vring (at the end of
> >>      vhost_vdpa_net_cvq_load())
> >>
> >> Guest specs: 128G Mem, SVQ=on, CVQ=on, 8 queue pairs:
> >>
> >> -netdev type=vhost-vdpa,vhostdev=$VHOST_VDPA_0,id=vhost-vdpa0,
> >>           queues=8,x-svq=on
> >>
> >> -device virtio-net-pci,netdev=vhost-vdpa0,id=vdpa0,bootindex=-1,
> >>           romfile=,page-per-vq=on,mac=$VF1_MAC,ctrl_vq=on,mq=on,
> >>           ctrl_vlan=off,vectors=18,host_mtu=9000,
> >>           disable-legacy=on,disable-modern=off
> >>
> >> ---
> >>
> >> Configuration time:    ~31s
> >> Dataplane enable time: ~0.14ms
> >>
> >
> > I was vague, but yes, that's representative enough! It would be more
> > accurate if the configuration time ends by the time QEMU enables the
> > first queue of the dataplane though.
> >
> > As Si-Wei mentions, is v->shared->listener_registered == true at the
> > beginning of vhost_vdpa_dev_start?
> >
>
> Ah, I also realized that Qemu I was using for measurements was using a
> version before the listener_registered member was introduced.
>
> I retested with the latest changes in Qemu and set x-svq=off, e.g.:
> guest specs: 128G Mem, SVQ=off, CVQ=on, 8 queue pairs. I ran testing 3
> times for measurements.
>
> v->shared->listener_registered == false at the beginning of
> vhost_vdpa_dev_start().
>

Let's move the effect of the mem pinning out of the downtime by
registering the listener before the migration. Can you check why it is
not registered at vhost_vdpa_set_owner?

> ---
>
> Configuration time: Time from first entry into vhost_vdpa_dev_start() to
> right after Qemu enables the first VQ.
>   - 26.947s, 26.606s, 27.326s
>
> Enable dataplane: Time from right after first VQ is enabled to right
> after the last VQ is enabled.
>   - 0.081ms, 0.081ms, 0.079ms
>

Re: [RFC 5/6] virtio,virtio-net: skip consistency check in virtio_load for iterative migration

Posted by Jonah Palmer 2 months, 3 weeks ago


On 8/19/25 3:10 AM, Eugenio Perez Martin wrote:
> On Mon, Aug 18, 2025 at 4:46 PM Jonah Palmer <jonah.palmer@oracle.com> wrote:
>>
>>
>>
>> On 8/18/25 2:51 AM, Eugenio Perez Martin wrote:
>>> On Fri, Aug 15, 2025 at 4:50 PM Jonah Palmer <jonah.palmer@oracle.com> wrote:
>>>>
>>>>
>>>>
>>>> On 8/14/25 5:28 AM, Eugenio Perez Martin wrote:
>>>>> On Wed, Aug 13, 2025 at 4:06 PM Peter Xu <peterx@redhat.com> wrote:
>>>>>>
>>>>>> On Wed, Aug 13, 2025 at 11:25:00AM +0200, Eugenio Perez Martin wrote:
>>>>>>> On Mon, Aug 11, 2025 at 11:56 PM Peter Xu <peterx@redhat.com> wrote:
>>>>>>>>
>>>>>>>> On Mon, Aug 11, 2025 at 05:26:05PM -0400, Jonah Palmer wrote:
>>>>>>>>> This effort was started to reduce the guest visible downtime by
>>>>>>>>> virtio-net/vhost-net/vhost-vDPA during live migration, especially
>>>>>>>>> vhost-vDPA.
>>>>>>>>>
>>>>>>>>> The downtime contributed by vhost-vDPA, for example, is not from having to
>>>>>>>>> migrate a lot of state but rather expensive backend control-plane latency
>>>>>>>>> like CVQ configurations (e.g. MQ queue pairs, RSS, MAC/VLAN filters, offload
>>>>>>>>> settings, MTU, etc.). Doing this requires kernel/HW NIC operations which
>>>>>>>>> dominates its downtime.
>>>>>>>>>
>>>>>>>>> In other words, by migrating the state of virtio-net early (before the
>>>>>>>>> stop-and-copy phase), we can also start staging backend configurations,
>>>>>>>>> which is the main contributor of downtime when migrating a vhost-vDPA
>>>>>>>>> device.
>>>>>>>>>
>>>>>>>>> I apologize if this series gives the impression that we're migrating a lot
>>>>>>>>> of data here. It's more along the lines of moving control-plane latency out
>>>>>>>>> of the stop-and-copy phase.
>>>>>>>>
>>>>>>>> I see, thanks.
>>>>>>>>
>>>>>>>> Please add these into the cover letter of the next post.  IMHO it's
>>>>>>>> extremely important information to explain the real goal of this work.  I
>>>>>>>> bet it is not expected for most people when reading the current cover
>>>>>>>> letter.
>>>>>>>>
>>>>>>>> Then it could have nothing to do with iterative phase, am I right?
>>>>>>>>
>>>>>>>> What are the data needed for the dest QEMU to start staging backend
>>>>>>>> configurations to the HWs underneath?  Does dest QEMU already have them in
>>>>>>>> the cmdlines?
>>>>>>>>
>>>>>>>> Asking this because I want to know whether it can be done completely
>>>>>>>> without src QEMU at all, e.g. when dest QEMU starts.
>>>>>>>>
>>>>>>>> If src QEMU's data is still needed, please also first consider providing
>>>>>>>> such facility using an "early VMSD" if it is ever possible: feel free to
>>>>>>>> refer to commit 3b95a71b22827d26178.
>>>>>>>>
>>>>>>>
>>>>>>> While it works for this series, it does not allow to resend the state
>>>>>>> when the src device changes. For example, if the number of virtqueues
>>>>>>> is modified.
>>>>>>
>>>>>> Some explanation on "how sync number of vqueues helps downtime" would help.
>>>>>> Not "it might preheat things", but exactly why, and how that differs when
>>>>>> it's pure software, and when hardware will be involved.
>>>>>>
>>>>>
>>>>> By nvidia engineers to configure vqs (number, size, RSS, etc) takes
>>>>> about ~200ms:
>>>>> https://urldefense.com/v3/__https://lore.kernel.org/qemu-devel/6c8ebb97-d546-3f1c-4cdd-54e23a566f61@nvidia.com/T/__;!!ACWV5N9M2RV99hQ!OQdf7sGaBlbXhcFHX7AC7HgYxvFljgwWlIgJCvMgWwFvPqMrAMbWqf0862zV5shIjaUvlrk54fLTK6uo2pA$
>>>>>
>>>>> Adding Dragos here in case he can provide more details. Maybe the
>>>>> numbers have changed though.
>>>>>
>>>>> And I guess the difference with pure SW will always come down to PCI
>>>>> communications, which I assume are slower than configuring the host SW
>>>>> device in RAM or even CPU cache. But I admit that proper profiling is
>>>>> needed before making those claims.
>>>>>
>>>>> Jonah, can you print the time it takes to configure the vDPA device
>>>>> with traces vs the time it takes to enable the dataplane of the
>>>>> device? So we can get an idea of how much time we save with this.
>>>>>
>>>>
>>>> Let me know if this isn't what you're looking for.
>>>>
>>>> I'm assuming by "configuration time" you mean:
>>>>     - Time from device startup (entry to vhost_vdpa_dev_start()) to right
>>>>       before we start enabling the vrings (e.g.
>>>>       VHOST_VDPA_SET_VRING_ENABLE in vhost_vdpa_net_cvq_load()).
>>>>
>>>> And by "time taken to enable the dataplane" I'm assuming you mean:
>>>>     - Time right before we start enabling the vrings (see above) to right
>>>>       after we enable the last vring (at the end of
>>>>       vhost_vdpa_net_cvq_load())
>>>>
>>>> Guest specs: 128G Mem, SVQ=on, CVQ=on, 8 queue pairs:
>>>>
>>>> -netdev type=vhost-vdpa,vhostdev=$VHOST_VDPA_0,id=vhost-vdpa0,
>>>>            queues=8,x-svq=on
>>>>
>>>> -device virtio-net-pci,netdev=vhost-vdpa0,id=vdpa0,bootindex=-1,
>>>>            romfile=,page-per-vq=on,mac=$VF1_MAC,ctrl_vq=on,mq=on,
>>>>            ctrl_vlan=off,vectors=18,host_mtu=9000,
>>>>            disable-legacy=on,disable-modern=off
>>>>
>>>> ---
>>>>
>>>> Configuration time:    ~31s
>>>> Dataplane enable time: ~0.14ms
>>>>
>>>
>>> I was vague, but yes, that's representative enough! It would be more
>>> accurate if the configuration time ends by the time QEMU enables the
>>> first queue of the dataplane though.
>>>
>>> As Si-Wei mentions, is v->shared->listener_registered == true at the
>>> beginning of vhost_vdpa_dev_start?
>>>
>>
>> Ah, I also realized that Qemu I was using for measurements was using a
>> version before the listener_registered member was introduced.
>>
>> I retested with the latest changes in Qemu and set x-svq=off, e.g.:
>> guest specs: 128G Mem, SVQ=off, CVQ=on, 8 queue pairs. I ran testing 3
>> times for measurements.
>>
>> v->shared->listener_registered == false at the beginning of
>> vhost_vdpa_dev_start().
>>
> 
> Let's move out the effect of the mem pinning from the downtime by
> registering the listener before the migration. Can you check why is it
> not registered at vhost_vdpa_set_owner?
> 

Sorry I was profiling improperly. The listener is registered at 
vhost_vdpa_set_owner initially and v->shared->listener_registered is set 
to true, but once we reach the first vhost_vdpa_dev_start call, it shows 
as false and is re-registered later in the function.

Should we always expect listener_registered == true at every 
vhost_vdpa_dev_start call during startup? This is what I traced during 
startup of a single guest (no migration). Tracepoint is right at the 
start of the vhost_vdpa_dev_start function:

vhost_vdpa_set_owner() - register memory listener
vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
...
* VQs are now being enabled *

I'm also seeing that when the guest is being shut down,
dev->vhost_ops->vhost_get_vring_base() is failing in
do_vhost_virtqueue_stop():

...
[  114.718429] systemd-shutdown[1]: Syncing filesystems and block devices.
[  114.719255] systemd-shutdown[1]: Powering off.
[  114.719916] sd 0:0:0:0: [sda] Synchronizing SCSI cache
[  114.724826] ACPI: PM: Preparing to enter system sleep state S5
[  114.725593] reboot: Power down
vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
qemu-system-x86_64: vhost VQ 2 ring restore failed: -1: Operation not 
permitted (1)
qemu-system-x86_64: vhost VQ 3 ring restore failed: -1: Operation not 
permitted (1)
vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
qemu-system-x86_64: vhost VQ 4 ring restore failed: -1: Operation not 
permitted (1)
qemu-system-x86_64: vhost VQ 5 ring restore failed: -1: Operation not 
permitted (1)
vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
qemu-system-x86_64: vhost VQ 6 ring restore failed: -1: Operation not 
permitted (1)
qemu-system-x86_64: vhost VQ 7 ring restore failed: -1: Operation not 
permitted (1)
vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
qemu-system-x86_64: vhost VQ 8 ring restore failed: -1: Operation not 
permitted (1)
qemu-system-x86_64: vhost VQ 9 ring restore failed: -1: Operation not 
permitted (1)
vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
qemu-system-x86_64: vhost VQ 10 ring restore failed: -1: Operation not 
permitted (1)
qemu-system-x86_64: vhost VQ 11 ring restore failed: -1: Operation not 
permitted (1)
vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
qemu-system-x86_64: vhost VQ 12 ring restore failed: -1: Operation not 
permitted (1)
qemu-system-x86_64: vhost VQ 13 ring restore failed: -1: Operation not 
permitted (1)
vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
qemu-system-x86_64: vhost VQ 14 ring restore failed: -1: Operation not 
permitted (1)
qemu-system-x86_64: vhost VQ 15 ring restore failed: -1: Operation not 
permitted (1)
vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0

However when x-svq=on, I don't see these errors on shutdown.

>> ---
>>
>> Configuration time: Time from first entry into vhost_vdpa_dev_start() to
>> right after Qemu enables the first VQ.
>>    - 26.947s, 26.606s, 27.326s
>>
>> Enable dataplane: Time from right after first VQ is enabled to right
>> after the last VQ is enabled.
>>    - 0.081ms, 0.081ms, 0.079ms
>>
>

Re: [RFC 5/6] virtio,virtio-net: skip consistency check in virtio_load for iterative migration

Posted by Eugenio Perez Martin 2 months, 3 weeks ago

On Tue, Aug 19, 2025 at 5:11 PM Jonah Palmer <jonah.palmer@oracle.com> wrote:
>
>
>
> On 8/19/25 3:10 AM, Eugenio Perez Martin wrote:
> > On Mon, Aug 18, 2025 at 4:46 PM Jonah Palmer <jonah.palmer@oracle.com> wrote:
> >>
> >>
> >>
> >> On 8/18/25 2:51 AM, Eugenio Perez Martin wrote:
> >>> On Fri, Aug 15, 2025 at 4:50 PM Jonah Palmer <jonah.palmer@oracle.com> wrote:
> >>>>
> >>>>
> >>>>
> >>>> On 8/14/25 5:28 AM, Eugenio Perez Martin wrote:
> >>>>> On Wed, Aug 13, 2025 at 4:06 PM Peter Xu <peterx@redhat.com> wrote:
> >>>>>>
> >>>>>> On Wed, Aug 13, 2025 at 11:25:00AM +0200, Eugenio Perez Martin wrote:
> >>>>>>> On Mon, Aug 11, 2025 at 11:56 PM Peter Xu <peterx@redhat.com> wrote:
> >>>>>>>>
> >>>>>>>> On Mon, Aug 11, 2025 at 05:26:05PM -0400, Jonah Palmer wrote:
> >>>>>>>>> This effort was started to reduce the guest visible downtime by
> >>>>>>>>> virtio-net/vhost-net/vhost-vDPA during live migration, especially
> >>>>>>>>> vhost-vDPA.
> >>>>>>>>>
> >>>>>>>>> The downtime contributed by vhost-vDPA, for example, is not from having to
> >>>>>>>>> migrate a lot of state but rather expensive backend control-plane latency
> >>>>>>>>> like CVQ configurations (e.g. MQ queue pairs, RSS, MAC/VLAN filters, offload
> >>>>>>>>> settings, MTU, etc.). Doing this requires kernel/HW NIC operations which
> >>>>>>>>> dominates its downtime.
> >>>>>>>>>
> >>>>>>>>> In other words, by migrating the state of virtio-net early (before the
> >>>>>>>>> stop-and-copy phase), we can also start staging backend configurations,
> >>>>>>>>> which is the main contributor of downtime when migrating a vhost-vDPA
> >>>>>>>>> device.
> >>>>>>>>>
> >>>>>>>>> I apologize if this series gives the impression that we're migrating a lot
> >>>>>>>>> of data here. It's more along the lines of moving control-plane latency out
> >>>>>>>>> of the stop-and-copy phase.
> >>>>>>>>
> >>>>>>>> I see, thanks.
> >>>>>>>>
> >>>>>>>> Please add these into the cover letter of the next post.  IMHO it's
> >>>>>>>> extremely important information to explain the real goal of this work.  I
> >>>>>>>> bet it is not expected for most people when reading the current cover
> >>>>>>>> letter.
> >>>>>>>>
> >>>>>>>> Then it could have nothing to do with iterative phase, am I right?
> >>>>>>>>
> >>>>>>>> What are the data needed for the dest QEMU to start staging backend
> >>>>>>>> configurations to the HWs underneath?  Does dest QEMU already have them in
> >>>>>>>> the cmdlines?
> >>>>>>>>
> >>>>>>>> Asking this because I want to know whether it can be done completely
> >>>>>>>> without src QEMU at all, e.g. when dest QEMU starts.
> >>>>>>>>
> >>>>>>>> If src QEMU's data is still needed, please also first consider providing
> >>>>>>>> such facility using an "early VMSD" if it is ever possible: feel free to
> >>>>>>>> refer to commit 3b95a71b22827d26178.
> >>>>>>>>
> >>>>>>>
> >>>>>>> While it works for this series, it does not allow to resend the state
> >>>>>>> when the src device changes. For example, if the number of virtqueues
> >>>>>>> is modified.
> >>>>>>
> >>>>>> Some explanation on "how sync number of vqueues helps downtime" would help.
> >>>>>> Not "it might preheat things", but exactly why, and how that differs when
> >>>>>> it's pure software, and when hardware will be involved.
> >>>>>>
> >>>>>
> >>>>> By nvidia engineers to configure vqs (number, size, RSS, etc) takes
> >>>>> about ~200ms:
> >>>>> https://urldefense.com/v3/__https://lore.kernel.org/qemu-devel/6c8ebb97-d546-3f1c-4cdd-54e23a566f61@nvidia.com/T/__;!!ACWV5N9M2RV99hQ!OQdf7sGaBlbXhcFHX7AC7HgYxvFljgwWlIgJCvMgWwFvPqMrAMbWqf0862zV5shIjaUvlrk54fLTK6uo2pA$
> >>>>>
> >>>>> Adding Dragos here in case he can provide more details. Maybe the
> >>>>> numbers have changed though.
> >>>>>
> >>>>> And I guess the difference with pure SW will always come down to PCI
> >>>>> communications, which I assume are slower than configuring the host SW
> >>>>> device in RAM or even CPU cache. But I admit that proper profiling is
> >>>>> needed before making those claims.
> >>>>>
> >>>>> Jonah, can you print the time it takes to configure the vDPA device
> >>>>> with traces vs the time it takes to enable the dataplane of the
> >>>>> device? So we can get an idea of how much time we save with this.
> >>>>>
> >>>>
> >>>> Let me know if this isn't what you're looking for.
> >>>>
> >>>> I'm assuming by "configuration time" you mean:
> >>>>     - Time from device startup (entry to vhost_vdpa_dev_start()) to right
> >>>>       before we start enabling the vrings (e.g.
> >>>>       VHOST_VDPA_SET_VRING_ENABLE in vhost_vdpa_net_cvq_load()).
> >>>>
> >>>> And by "time taken to enable the dataplane" I'm assuming you mean:
> >>>>     - Time right before we start enabling the vrings (see above) to right
> >>>>       after we enable the last vring (at the end of
> >>>>       vhost_vdpa_net_cvq_load())
> >>>>
> >>>> Guest specs: 128G Mem, SVQ=on, CVQ=on, 8 queue pairs:
> >>>>
> >>>> -netdev type=vhost-vdpa,vhostdev=$VHOST_VDPA_0,id=vhost-vdpa0,
> >>>>            queues=8,x-svq=on
> >>>>
> >>>> -device virtio-net-pci,netdev=vhost-vdpa0,id=vdpa0,bootindex=-1,
> >>>>            romfile=,page-per-vq=on,mac=$VF1_MAC,ctrl_vq=on,mq=on,
> >>>>            ctrl_vlan=off,vectors=18,host_mtu=9000,
> >>>>            disable-legacy=on,disable-modern=off
> >>>>
> >>>> ---
> >>>>
> >>>> Configuration time:    ~31s
> >>>> Dataplane enable time: ~0.14ms
> >>>>
> >>>
> >>> I was vague, but yes, that's representative enough! It would be more
> >>> accurate if the configuration time ends by the time QEMU enables the
> >>> first queue of the dataplane though.
> >>>
> >>> As Si-Wei mentions, is v->shared->listener_registered == true at the
> >>> beginning of vhost_vdpa_dev_start?
> >>>
> >>
> >> Ah, I also realized that Qemu I was using for measurements was using a
> >> version before the listener_registered member was introduced.
> >>
> >> I retested with the latest changes in Qemu and set x-svq=off, e.g.:
> >> guest specs: 128G Mem, SVQ=off, CVQ=on, 8 queue pairs. I ran testing 3
> >> times for measurements.
> >>
> >> v->shared->listener_registered == false at the beginning of
> >> vhost_vdpa_dev_start().
> >>
> >
> > Let's move out the effect of the mem pinning from the downtime by
> > registering the listener before the migration. Can you check why is it
> > not registered at vhost_vdpa_set_owner?
> >
>
> Sorry I was profiling improperly. The listener is registered at
> vhost_vdpa_set_owner initially and v->shared->listener_registered is set
> to true, but once we reach the first vhost_vdpa_dev_start call, it shows
> as false and is re-registered later in the function.
>
> Should we always expect listener_registered == true at every
> vhost_vdpa_dev_start call during startup?

Yes, that leaves all the memory pinning time out of the downtime.

> This is what I traced during
> startup of a single guest (no migration).

We can trace the destination's QEMU to be more accurate, but probably
it makes no difference.

> Tracepoint is right at the
> start of the vhost_vdpa_dev_start function:
>
> vhost_vdpa_set_owner() - register memory listener
> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1

This is surprising. Can you trace how listener_registered goes to 0 again?

> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> ...
> * VQs are now being enabled *
>
> I'm also seeing that when the guest is being shutdown,
> dev->vhost_ops->vhost_get_vring_base() is failing in
> do_vhost_virtqueue_stop():
>
> ...
> [  114.718429] systemd-shutdown[1]: Syncing filesystems and block devices.
> [  114.719255] systemd-shutdown[1]: Powering off.
> [  114.719916] sd 0:0:0:0: [sda] Synchronizing SCSI cache
> [  114.724826] ACPI: PM: Preparing to enter system sleep state S5
> [  114.725593] reboot: Power down
> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
> qemu-system-x86_64: vhost VQ 2 ring restore failed: -1: Operation not
> permitted (1)
> qemu-system-x86_64: vhost VQ 3 ring restore failed: -1: Operation not
> permitted (1)
> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
> qemu-system-x86_64: vhost VQ 4 ring restore failed: -1: Operation not
> permitted (1)
> qemu-system-x86_64: vhost VQ 5 ring restore failed: -1: Operation not
> permitted (1)
> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
> qemu-system-x86_64: vhost VQ 6 ring restore failed: -1: Operation not
> permitted (1)
> qemu-system-x86_64: vhost VQ 7 ring restore failed: -1: Operation not
> permitted (1)
> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
> qemu-system-x86_64: vhost VQ 8 ring restore failed: -1: Operation not
> permitted (1)
> qemu-system-x86_64: vhost VQ 9 ring restore failed: -1: Operation not
> permitted (1)
> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
> qemu-system-x86_64: vhost VQ 10 ring restore failed: -1: Operation not
> permitted (1)
> qemu-system-x86_64: vhost VQ 11 ring restore failed: -1: Operation not
> permitted (1)
> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
> qemu-system-x86_64: vhost VQ 12 ring restore failed: -1: Operation not
> permitted (1)
> qemu-system-x86_64: vhost VQ 13 ring restore failed: -1: Operation not
> permitted (1)
> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
> qemu-system-x86_64: vhost VQ 14 ring restore failed: -1: Operation not
> permitted (1)
> qemu-system-x86_64: vhost VQ 15 ring restore failed: -1: Operation not
> permitted (1)
> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
>
> However when x-svq=on, I don't see these errors on shutdown.
>

SVQ can mask this error as it does not need to forward the ring
restore message to the device. It can just start with 0 and convert
indexes.

Let's focus on listener_registered first :).

> >> ---
> >>
> >> Configuration time: Time from first entry into vhost_vdpa_dev_start() to
> >> right after Qemu enables the first VQ.
> >>    - 26.947s, 26.606s, 27.326s
> >>
> >> Enable dataplane: Time from right after first VQ is enabled to right
> >> after the last VQ is enabled.
> >>    - 0.081ms, 0.081ms, 0.079ms
> >>
> >
>

Re: [RFC 5/6] virtio,virtio-net: skip consistency check in virtio_load for iterative migration

Posted by Jonah Palmer 2 months, 2 weeks ago


On 8/20/25 3:59 AM, Eugenio Perez Martin wrote:
> On Tue, Aug 19, 2025 at 5:11 PM Jonah Palmer <jonah.palmer@oracle.com> wrote:
>>
>>
>>
>> On 8/19/25 3:10 AM, Eugenio Perez Martin wrote:
>>> On Mon, Aug 18, 2025 at 4:46 PM Jonah Palmer <jonah.palmer@oracle.com> wrote:
>>>>
>>>>
>>>>
>>>> On 8/18/25 2:51 AM, Eugenio Perez Martin wrote:
>>>>> On Fri, Aug 15, 2025 at 4:50 PM Jonah Palmer <jonah.palmer@oracle.com> wrote:
>>>>>>
>>>>>>
>>>>>>
>>>>>> On 8/14/25 5:28 AM, Eugenio Perez Martin wrote:
>>>>>>> On Wed, Aug 13, 2025 at 4:06 PM Peter Xu <peterx@redhat.com> wrote:
>>>>>>>>
>>>>>>>> On Wed, Aug 13, 2025 at 11:25:00AM +0200, Eugenio Perez Martin wrote:
>>>>>>>>> On Mon, Aug 11, 2025 at 11:56 PM Peter Xu <peterx@redhat.com> wrote:
>>>>>>>>>>
>>>>>>>>>> On Mon, Aug 11, 2025 at 05:26:05PM -0400, Jonah Palmer wrote:
>>>>>>>>>>> This effort was started to reduce the guest visible downtime by
>>>>>>>>>>> virtio-net/vhost-net/vhost-vDPA during live migration, especially
>>>>>>>>>>> vhost-vDPA.
>>>>>>>>>>>
>>>>>>>>>>> The downtime contributed by vhost-vDPA, for example, is not from having to
>>>>>>>>>>> migrate a lot of state but rather expensive backend control-plane latency
>>>>>>>>>>> like CVQ configurations (e.g. MQ queue pairs, RSS, MAC/VLAN filters, offload
>>>>>>>>>>> settings, MTU, etc.). Doing this requires kernel/HW NIC operations which
>>>>>>>>>>> dominates its downtime.
>>>>>>>>>>>
>>>>>>>>>>> In other words, by migrating the state of virtio-net early (before the
>>>>>>>>>>> stop-and-copy phase), we can also start staging backend configurations,
>>>>>>>>>>> which is the main contributor of downtime when migrating a vhost-vDPA
>>>>>>>>>>> device.
>>>>>>>>>>>
>>>>>>>>>>> I apologize if this series gives the impression that we're migrating a lot
>>>>>>>>>>> of data here. It's more along the lines of moving control-plane latency out
>>>>>>>>>>> of the stop-and-copy phase.
>>>>>>>>>>
>>>>>>>>>> I see, thanks.
>>>>>>>>>>
>>>>>>>>>> Please add these into the cover letter of the next post.  IMHO it's
>>>>>>>>>> extremely important information to explain the real goal of this work.  I
>>>>>>>>>> bet it is not expected for most people when reading the current cover
>>>>>>>>>> letter.
>>>>>>>>>>
>>>>>>>>>> Then it could have nothing to do with iterative phase, am I right?
>>>>>>>>>>
>>>>>>>>>> What are the data needed for the dest QEMU to start staging backend
>>>>>>>>>> configurations to the HWs underneath?  Does dest QEMU already have them in
>>>>>>>>>> the cmdlines?
>>>>>>>>>>
>>>>>>>>>> Asking this because I want to know whether it can be done completely
>>>>>>>>>> without src QEMU at all, e.g. when dest QEMU starts.
>>>>>>>>>>
>>>>>>>>>> If src QEMU's data is still needed, please also first consider providing
>>>>>>>>>> such facility using an "early VMSD" if it is ever possible: feel free to
>>>>>>>>>> refer to commit 3b95a71b22827d26178.
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> While it works for this series, it does not allow to resend the state
>>>>>>>>> when the src device changes. For example, if the number of virtqueues
>>>>>>>>> is modified.
>>>>>>>>
>>>>>>>> Some explanation on "how sync number of vqueues helps downtime" would help.
>>>>>>>> Not "it might preheat things", but exactly why, and how that differs when
>>>>>>>> it's pure software, and when hardware will be involved.
>>>>>>>>
>>>>>>>
>>>>>>> According to NVIDIA engineers, configuring vqs (number, size, RSS, etc.)
>>>>>>> takes about 200ms:
>>>>>>> https://urldefense.com/v3/__https://lore.kernel.org/qemu-devel/6c8ebb97-d546-3f1c-4cdd-54e23a566f61@nvidia.com/T/__;!!ACWV5N9M2RV99hQ!OQdf7sGaBlbXhcFHX7AC7HgYxvFljgwWlIgJCvMgWwFvPqMrAMbWqf0862zV5shIjaUvlrk54fLTK6uo2pA$
>>>>>>>
>>>>>>> Adding Dragos here in case he can provide more details. Maybe the
>>>>>>> numbers have changed though.
>>>>>>>
>>>>>>> And I guess the difference with pure SW will always come down to PCI
>>>>>>> communications, which I assume are slower than configuring the host SW
>>>>>>> device in RAM or even CPU cache. But I admit that proper profiling is
>>>>>>> needed before making those claims.
>>>>>>>
>>>>>>> Jonah, can you print the time it takes to configure the vDPA device
>>>>>>> with traces vs the time it takes to enable the dataplane of the
>>>>>>> device? So we can get an idea of how much time we save with this.
>>>>>>>
>>>>>>
>>>>>> Let me know if this isn't what you're looking for.
>>>>>>
>>>>>> I'm assuming by "configuration time" you mean:
>>>>>>      - Time from device startup (entry to vhost_vdpa_dev_start()) to right
>>>>>>        before we start enabling the vrings (e.g.
>>>>>>        VHOST_VDPA_SET_VRING_ENABLE in vhost_vdpa_net_cvq_load()).
>>>>>>
>>>>>> And by "time taken to enable the dataplane" I'm assuming you mean:
>>>>>>      - Time right before we start enabling the vrings (see above) to right
>>>>>>        after we enable the last vring (at the end of
>>>>>>        vhost_vdpa_net_cvq_load())
>>>>>>
>>>>>> Guest specs: 128G Mem, SVQ=on, CVQ=on, 8 queue pairs:
>>>>>>
>>>>>> -netdev type=vhost-vdpa,vhostdev=$VHOST_VDPA_0,id=vhost-vdpa0,
>>>>>>             queues=8,x-svq=on
>>>>>>
>>>>>> -device virtio-net-pci,netdev=vhost-vdpa0,id=vdpa0,bootindex=-1,
>>>>>>             romfile=,page-per-vq=on,mac=$VF1_MAC,ctrl_vq=on,mq=on,
>>>>>>             ctrl_vlan=off,vectors=18,host_mtu=9000,
>>>>>>             disable-legacy=on,disable-modern=off
>>>>>>
>>>>>> ---
>>>>>>
>>>>>> Configuration time:    ~31s
>>>>>> Dataplane enable time: ~0.14ms
>>>>>>
>>>>>
>>>>> I was vague, but yes, that's representative enough! It would be more
>>>>> accurate if the configuration time ends by the time QEMU enables the
>>>>> first queue of the dataplane though.
>>>>>
>>>>> As Si-Wei mentions, is v->shared->listener_registered == true at the
>>>>> beginning of vhost_vdpa_dev_start?
>>>>>
>>>>
>>>> Ah, I also realized that Qemu I was using for measurements was using a
>>>> version before the listener_registered member was introduced.
>>>>
>>>> I retested with the latest changes in Qemu and set x-svq=off, e.g.:
>>>> guest specs: 128G Mem, SVQ=off, CVQ=on, 8 queue pairs. I ran testing 3
>>>> times for measurements.
>>>>
>>>> v->shared->listener_registered == false at the beginning of
>>>> vhost_vdpa_dev_start().
>>>>
>>>
>>> Let's move out the effect of the mem pinning from the downtime by
>>> registering the listener before the migration. Can you check why is it
>>> not registered at vhost_vdpa_set_owner?
>>>
>>
>> Sorry I was profiling improperly. The listener is registered at
>> vhost_vdpa_set_owner initially and v->shared->listener_registered is set
>> to true, but once we reach the first vhost_vdpa_dev_start call, it shows
>> as false and is re-registered later in the function.
>>
>> Should we always expect listener_registered == true at every
>> vhost_vdpa_dev_start call during startup?
> 
> Yes, that leaves all the memory pinning time out of the downtime.
> 
>> This is what I traced during
>> startup of a single guest (no migration).
> 
> We can trace the destination's QEMU to be more accurate, but probably
> it makes no difference.
> 
>> Tracepoint is right at the
>> start of the vhost_vdpa_dev_start function:
>>
>> vhost_vdpa_set_owner() - register memory listener
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> 
> This is surprising. Can you trace how listener_registered goes to 0 again?
> 
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
>> ...
>> * VQs are now being enabled *
>>
>> I'm also seeing that when the guest is being shutdown,
>> dev->vhost_ops->vhost_get_vring_base() is failing in
>> do_vhost_virtqueue_stop():
>>
>> ...
>> [  114.718429] systemd-shutdown[1]: Syncing filesystems and block devices.
>> [  114.719255] systemd-shutdown[1]: Powering off.
>> [  114.719916] sd 0:0:0:0: [sda] Synchronizing SCSI cache
>> [  114.724826] ACPI: PM: Preparing to enter system sleep state S5
>> [  114.725593] reboot: Power down
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
>> qemu-system-x86_64: vhost VQ 2 ring restore failed: -1: Operation not
>> permitted (1)
>> qemu-system-x86_64: vhost VQ 3 ring restore failed: -1: Operation not
>> permitted (1)
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
>> qemu-system-x86_64: vhost VQ 4 ring restore failed: -1: Operation not
>> permitted (1)
>> qemu-system-x86_64: vhost VQ 5 ring restore failed: -1: Operation not
>> permitted (1)
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
>> qemu-system-x86_64: vhost VQ 6 ring restore failed: -1: Operation not
>> permitted (1)
>> qemu-system-x86_64: vhost VQ 7 ring restore failed: -1: Operation not
>> permitted (1)
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
>> qemu-system-x86_64: vhost VQ 8 ring restore failed: -1: Operation not
>> permitted (1)
>> qemu-system-x86_64: vhost VQ 9 ring restore failed: -1: Operation not
>> permitted (1)
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
>> qemu-system-x86_64: vhost VQ 10 ring restore failed: -1: Operation not
>> permitted (1)
>> qemu-system-x86_64: vhost VQ 11 ring restore failed: -1: Operation not
>> permitted (1)
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
>> qemu-system-x86_64: vhost VQ 12 ring restore failed: -1: Operation not
>> permitted (1)
>> qemu-system-x86_64: vhost VQ 13 ring restore failed: -1: Operation not
>> permitted (1)
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
>> qemu-system-x86_64: vhost VQ 14 ring restore failed: -1: Operation not
>> permitted (1)
>> qemu-system-x86_64: vhost VQ 15 ring restore failed: -1: Operation not
>> permitted (1)
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
>>
>> However when x-svq=on, I don't see these errors on shutdown.
>>
> 
> SVQ can mask this error as it does not need to forward the ring
> restore message to the device. It can just start with 0 and convert
> indexes.
> 
> Let's focus on listener_registered first :).
> 
>>>> ---
>>>>
>>>> Configuration time: Time from first entry into vhost_vdpa_dev_start() to
>>>> right after Qemu enables the first VQ.
>>>>     - 26.947s, 26.606s, 27.326s
>>>>
>>>> Enable dataplane: Time from right after first VQ is enabled to right
>>>> after the last VQ is enabled.
>>>>     - 0.081ms, 0.081ms, 0.079ms
>>>>
>>>
>>
> 

I looked into this a bit more and realized I was being naive thinking 
that the vhost-vDPA device startup path of a single VM would be the same 
as that on a destination VM during live migration. This is **not** the 
case and I apologize for the confusion I caused.

What I described and profiled above is indeed true for the startup of a 
single VM / source VM with a vhost-vDPA device. However, this is not 
true on the destination side and its configuration time is drastically 
different.

Under the same specs, but now with a live migration performed between a 
source and destination VM (128G Mem, SVQ=off, CVQ=on, 8 queue pairs), 
and using the same tracepoints to find the configuration time and enable 
dataplane time, these are the measurements I found for the **destination 
VM**:

Configuration time: Time from first entry into vhost_vdpa_dev_start to 
right after Qemu enables the first VQ.
    - 268.603ms, 241.515ms, 249.007ms

Enable dataplane time: Time from right after the first VQ is enabled to 
right after the last VQ is enabled.
    - 0.072ms, 0.071ms, 0.070ms

---

For those curious, using the same printouts as I did above, this is what 
it actually looks like on the destination side:

* Destination VM is started *

vhost_vdpa_set_owner() - register memory listener
vhost_vdpa_reset_device() - unregistering listener

* Start live migration on source VM *
(qemu) migrate unix:/tmp/lm.sock
...

vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
vhost_vdpa_dev_start() - register listener

And this is very different than the churning we saw in my previous email 
that happens on the source / single guest VM with vhost-vDPA and its 
startup path.

---

Again, apologies for the confusion this caused. This was my fault for not
being more careful.

Jonah

Re: [RFC 5/6] virtio,virtio-net: skip consistency check in virtio_load for iterative migration

Posted by Eugenio Perez Martin 2 months, 2 weeks ago

On Wed, Aug 27, 2025 at 6:56 PM Jonah Palmer <jonah.palmer@oracle.com> wrote:
>
>
>
> On 8/20/25 3:59 AM, Eugenio Perez Martin wrote:
> > On Tue, Aug 19, 2025 at 5:11 PM Jonah Palmer <jonah.palmer@oracle.com> wrote:
> >>
> >>
> >>
> >> On 8/19/25 3:10 AM, Eugenio Perez Martin wrote:
> >>> On Mon, Aug 18, 2025 at 4:46 PM Jonah Palmer <jonah.palmer@oracle.com> wrote:
> >>>>
> >>>>
> >>>>
> >>>> On 8/18/25 2:51 AM, Eugenio Perez Martin wrote:
> >>>>> On Fri, Aug 15, 2025 at 4:50 PM Jonah Palmer <jonah.palmer@oracle.com> wrote:
> >>>>>>
> >>>>>>
> >>>>>>
> >>>>>> On 8/14/25 5:28 AM, Eugenio Perez Martin wrote:
> >>>>>>> On Wed, Aug 13, 2025 at 4:06 PM Peter Xu <peterx@redhat.com> wrote:
> >>>>>>>>
> >>>>>>>> On Wed, Aug 13, 2025 at 11:25:00AM +0200, Eugenio Perez Martin wrote:
> >>>>>>>>> On Mon, Aug 11, 2025 at 11:56 PM Peter Xu <peterx@redhat.com> wrote:
> >>>>>>>>>>
> >>>>>>>>>> On Mon, Aug 11, 2025 at 05:26:05PM -0400, Jonah Palmer wrote:
> >>>>>>>>>>> This effort was started to reduce the guest visible downtime by
> >>>>>>>>>>> virtio-net/vhost-net/vhost-vDPA during live migration, especially
> >>>>>>>>>>> vhost-vDPA.
> >>>>>>>>>>>
> >>>>>>>>>>> The downtime contributed by vhost-vDPA, for example, is not from having to
> >>>>>>>>>>> migrate a lot of state but rather expensive backend control-plane latency
> >>>>>>>>>>> like CVQ configurations (e.g. MQ queue pairs, RSS, MAC/VLAN filters, offload
> >>>>>>>>>>> settings, MTU, etc.). Doing this requires kernel/HW NIC operations which
> >>>>>>>>>>> dominates its downtime.
> >>>>>>>>>>>
> >>>>>>>>>>> In other words, by migrating the state of virtio-net early (before the
> >>>>>>>>>>> stop-and-copy phase), we can also start staging backend configurations,
> >>>>>>>>>>> which is the main contributor of downtime when migrating a vhost-vDPA
> >>>>>>>>>>> device.
> >>>>>>>>>>>
> >>>>>>>>>>> I apologize if this series gives the impression that we're migrating a lot
> >>>>>>>>>>> of data here. It's more along the lines of moving control-plane latency out
> >>>>>>>>>>> of the stop-and-copy phase.
> >>>>>>>>>>
> >>>>>>>>>> I see, thanks.
> >>>>>>>>>>
> >>>>>>>>>> Please add these into the cover letter of the next post.  IMHO it's
> >>>>>>>>>> extremely important information to explain the real goal of this work.  I
> >>>>>>>>>> bet it is not expected for most people when reading the current cover
> >>>>>>>>>> letter.
> >>>>>>>>>>
> >>>>>>>>>> Then it could have nothing to do with iterative phase, am I right?
> >>>>>>>>>>
> >>>>>>>>>> What are the data needed for the dest QEMU to start staging backend
> >>>>>>>>>> configurations to the HWs underneath?  Does dest QEMU already have them in
> >>>>>>>>>> the cmdlines?
> >>>>>>>>>>
> >>>>>>>>>> Asking this because I want to know whether it can be done completely
> >>>>>>>>>> without src QEMU at all, e.g. when dest QEMU starts.
> >>>>>>>>>>
> >>>>>>>>>> If src QEMU's data is still needed, please also first consider providing
> >>>>>>>>>> such facility using an "early VMSD" if it is ever possible: feel free to
> >>>>>>>>>> refer to commit 3b95a71b22827d26178.
> >>>>>>>>>>
> >>>>>>>>>
> >>>>>>>>> While it works for this series, it does not allow to resend the state
> >>>>>>>>> when the src device changes. For example, if the number of virtqueues
> >>>>>>>>> is modified.
> >>>>>>>>
> >>>>>>>> Some explanation on "how sync number of vqueues helps downtime" would help.
> >>>>>>>> Not "it might preheat things", but exactly why, and how that differs when
> >>>>>>>> it's pure software, and when hardware will be involved.
> >>>>>>>>
> >>>>>>>
> >>>>>>> According to NVIDIA engineers, configuring vqs (number, size, RSS, etc.)
> >>>>>>> takes about 200ms:
> >>>>>>> https://urldefense.com/v3/__https://lore.kernel.org/qemu-devel/6c8ebb97-d546-3f1c-4cdd-54e23a566f61@nvidia.com/T/__;!!ACWV5N9M2RV99hQ!OQdf7sGaBlbXhcFHX7AC7HgYxvFljgwWlIgJCvMgWwFvPqMrAMbWqf0862zV5shIjaUvlrk54fLTK6uo2pA$
> >>>>>>>
> >>>>>>> Adding Dragos here in case he can provide more details. Maybe the
> >>>>>>> numbers have changed though.
> >>>>>>>
> >>>>>>> And I guess the difference with pure SW will always come down to PCI
> >>>>>>> communications, which I assume are slower than configuring the host SW
> >>>>>>> device in RAM or even CPU cache. But I admit that proper profiling is
> >>>>>>> needed before making those claims.
> >>>>>>>
> >>>>>>> Jonah, can you print the time it takes to configure the vDPA device
> >>>>>>> with traces vs the time it takes to enable the dataplane of the
> >>>>>>> device? So we can get an idea of how much time we save with this.
> >>>>>>>
> >>>>>>
> >>>>>> Let me know if this isn't what you're looking for.
> >>>>>>
> >>>>>> I'm assuming by "configuration time" you mean:
> >>>>>>      - Time from device startup (entry to vhost_vdpa_dev_start()) to right
> >>>>>>        before we start enabling the vrings (e.g.
> >>>>>>        VHOST_VDPA_SET_VRING_ENABLE in vhost_vdpa_net_cvq_load()).
> >>>>>>
> >>>>>> And by "time taken to enable the dataplane" I'm assuming you mean:
> >>>>>>      - Time right before we start enabling the vrings (see above) to right
> >>>>>>        after we enable the last vring (at the end of
> >>>>>>        vhost_vdpa_net_cvq_load())
> >>>>>>
> >>>>>> Guest specs: 128G Mem, SVQ=on, CVQ=on, 8 queue pairs:
> >>>>>>
> >>>>>> -netdev type=vhost-vdpa,vhostdev=$VHOST_VDPA_0,id=vhost-vdpa0,
> >>>>>>             queues=8,x-svq=on
> >>>>>>
> >>>>>> -device virtio-net-pci,netdev=vhost-vdpa0,id=vdpa0,bootindex=-1,
> >>>>>>             romfile=,page-per-vq=on,mac=$VF1_MAC,ctrl_vq=on,mq=on,
> >>>>>>             ctrl_vlan=off,vectors=18,host_mtu=9000,
> >>>>>>             disable-legacy=on,disable-modern=off
> >>>>>>
> >>>>>> ---
> >>>>>>
> >>>>>> Configuration time:    ~31s
> >>>>>> Dataplane enable time: ~0.14ms
> >>>>>>
> >>>>>
> >>>>> I was vague, but yes, that's representative enough! It would be more
> >>>>> accurate if the configuration time ends by the time QEMU enables the
> >>>>> first queue of the dataplane though.
> >>>>>
> >>>>> As Si-Wei mentions, is v->shared->listener_registered == true at the
> >>>>> beginning of vhost_vdpa_dev_start?
> >>>>>
> >>>>
> >>>> Ah, I also realized that Qemu I was using for measurements was using a
> >>>> version before the listener_registered member was introduced.
> >>>>
> >>>> I retested with the latest changes in Qemu and set x-svq=off, e.g.:
> >>>> guest specs: 128G Mem, SVQ=off, CVQ=on, 8 queue pairs. I ran testing 3
> >>>> times for measurements.
> >>>>
> >>>> v->shared->listener_registered == false at the beginning of
> >>>> vhost_vdpa_dev_start().
> >>>>
> >>>
> >>> Let's move out the effect of the mem pinning from the downtime by
> >>> registering the listener before the migration. Can you check why is it
> >>> not registered at vhost_vdpa_set_owner?
> >>>
> >>
> >> Sorry I was profiling improperly. The listener is registered at
> >> vhost_vdpa_set_owner initially and v->shared->listener_registered is set
> >> to true, but once we reach the first vhost_vdpa_dev_start call, it shows
> >> as false and is re-registered later in the function.
> >>
> >> Should we always expect listener_registered == true at every
> >> vhost_vdpa_dev_start call during startup?
> >
> > Yes, that leaves all the memory pinning time out of the downtime.
> >
> >> This is what I traced during
> >> startup of a single guest (no migration).
> >
> > We can trace the destination's QEMU to be more accurate, but probably
> > it makes no difference.
> >
> >> Tracepoint is right at the
> >> start of the vhost_vdpa_dev_start function:
> >>
> >> vhost_vdpa_set_owner() - register memory listener
> >> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> >> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
> >> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> >
> > This is surprising. Can you trace how listener_registered goes to 0 again?
> >
> >> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> >> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> >> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> >> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> >> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> >> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> >> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> >> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> >> ...
> >> * VQs are now being enabled *
> >>
> >> I'm also seeing that when the guest is being shutdown,
> >> dev->vhost_ops->vhost_get_vring_base() is failing in
> >> do_vhost_virtqueue_stop():
> >>
> >> ...
> >> [  114.718429] systemd-shutdown[1]: Syncing filesystems and block devices.
> >> [  114.719255] systemd-shutdown[1]: Powering off.
> >> [  114.719916] sd 0:0:0:0: [sda] Synchronizing SCSI cache
> >> [  114.724826] ACPI: PM: Preparing to enter system sleep state S5
> >> [  114.725593] reboot: Power down
> >> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
> >> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
> >> qemu-system-x86_64: vhost VQ 2 ring restore failed: -1: Operation not
> >> permitted (1)
> >> qemu-system-x86_64: vhost VQ 3 ring restore failed: -1: Operation not
> >> permitted (1)
> >> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
> >> qemu-system-x86_64: vhost VQ 4 ring restore failed: -1: Operation not
> >> permitted (1)
> >> qemu-system-x86_64: vhost VQ 5 ring restore failed: -1: Operation not
> >> permitted (1)
> >> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
> >> qemu-system-x86_64: vhost VQ 6 ring restore failed: -1: Operation not
> >> permitted (1)
> >> qemu-system-x86_64: vhost VQ 7 ring restore failed: -1: Operation not
> >> permitted (1)
> >> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
> >> qemu-system-x86_64: vhost VQ 8 ring restore failed: -1: Operation not
> >> permitted (1)
> >> qemu-system-x86_64: vhost VQ 9 ring restore failed: -1: Operation not
> >> permitted (1)
> >> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
> >> qemu-system-x86_64: vhost VQ 10 ring restore failed: -1: Operation not
> >> permitted (1)
> >> qemu-system-x86_64: vhost VQ 11 ring restore failed: -1: Operation not
> >> permitted (1)
> >> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
> >> qemu-system-x86_64: vhost VQ 12 ring restore failed: -1: Operation not
> >> permitted (1)
> >> qemu-system-x86_64: vhost VQ 13 ring restore failed: -1: Operation not
> >> permitted (1)
> >> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
> >> qemu-system-x86_64: vhost VQ 14 ring restore failed: -1: Operation not
> >> permitted (1)
> >> qemu-system-x86_64: vhost VQ 15 ring restore failed: -1: Operation not
> >> permitted (1)
> >> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
> >>
> >> However when x-svq=on, I don't see these errors on shutdown.
> >>
> >
> > SVQ can mask this error as it does not need to forward the ring
> > restore message to the device. It can just start with 0 and convert
> > indexes.
> >
> > Let's focus on listener_registered first :).
> >
> >>>> ---
> >>>>
> >>>> Configuration time: Time from first entry into vhost_vdpa_dev_start() to
> >>>> right after Qemu enables the first VQ.
> >>>>     - 26.947s, 26.606s, 27.326s
> >>>>
> >>>> Enable dataplane: Time from right after first VQ is enabled to right
> >>>> after the last VQ is enabled.
> >>>>     - 0.081ms, 0.081ms, 0.079ms
> >>>>
> >>>
> >>
> >
>
> I looked into this a bit more and realized I was being naive thinking
> that the vhost-vDPA device startup path of a single VM would be the same
> as that on a destination VM during live migration. This is **not** the
> case and I apologize for the confusion I caused.
>
> What I described and profiled above is indeed true for the startup of a
> single VM / source VM with a vhost-vDPA device. However, this is not
> true on the destination side and its configuration time is drastically
> different.
>
> Under the same specs, but now with a live migration performed between a
> source and destination VM (128G Mem, SVQ=off, CVQ=on, 8 queue pairs),
> and using the same tracepoints to find the configuration time and enable
> dataplane time, these are the measurements I found for the **destination
> VM**:
>
> Configuration time: Time from first entry into vhost_vdpa_dev_start to
> right after Qemu enables the first VQ.
>     - 268.603ms, 241.515ms, 249.007ms
>
> Enable dataplane time: Time from right after the first VQ is enabled to
> right after the last VQ is enabled.
>     - 0.072ms, 0.071ms, 0.070ms
>
> ---
>
> For those curious, using the same printouts as I did above, this is what
> it actually looks like on the destination side:
>
> * Destination VM is started *
>
> vhost_vdpa_set_owner() - register memory listener
> vhost_vdpa_reset_device() - unregistering listener
>
> * Start live migration on source VM *
> (qemu) migrate unix:/tmp/lm.sock
> ...
>
> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> vhost_vdpa_dev_start() - register listener
>

That's weird. Can you check why the memory listener is not registered
at vhost_vdpa_set_owner? Or, if it is registered there, why is it no
longer registered by the time vhost_vdpa_dev_start is called? This changes
the downtime a lot: more than half of the time is spent on this, so it
is worth fixing before continuing.

> And this is very different than the churning we saw in my previous email
> that happens on the source / single guest VM with vhost-vDPA and its
> startup path.
>
> ---
>
> Again, apologies for the confusion this caused. This was my fault for not
> being more careful.
>

No worries!

Re: [RFC 5/6] virtio,virtio-net: skip consistency check in virtio_load for iterative migration

Posted by Jonah Palmer 2 months, 2 weeks ago


On 9/1/25 2:57 AM, Eugenio Perez Martin wrote:
> On Wed, Aug 27, 2025 at 6:56 PM Jonah Palmer <jonah.palmer@oracle.com> wrote:
>>
>>
>>
>> On 8/20/25 3:59 AM, Eugenio Perez Martin wrote:
>>> On Tue, Aug 19, 2025 at 5:11 PM Jonah Palmer <jonah.palmer@oracle.com> wrote:
>>>>
>>>>
>>>>
>>>> On 8/19/25 3:10 AM, Eugenio Perez Martin wrote:
>>>>> On Mon, Aug 18, 2025 at 4:46 PM Jonah Palmer <jonah.palmer@oracle.com> wrote:
>>>>>>
>>>>>>
>>>>>>
>>>>>> On 8/18/25 2:51 AM, Eugenio Perez Martin wrote:
>>>>>>> On Fri, Aug 15, 2025 at 4:50 PM Jonah Palmer <jonah.palmer@oracle.com> wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> On 8/14/25 5:28 AM, Eugenio Perez Martin wrote:
>>>>>>>>> On Wed, Aug 13, 2025 at 4:06 PM Peter Xu <peterx@redhat.com> wrote:
>>>>>>>>>>
>>>>>>>>>> On Wed, Aug 13, 2025 at 11:25:00AM +0200, Eugenio Perez Martin wrote:
>>>>>>>>>>> On Mon, Aug 11, 2025 at 11:56 PM Peter Xu <peterx@redhat.com> wrote:
>>>>>>>>>>>>
>>>>>>>>>>>> On Mon, Aug 11, 2025 at 05:26:05PM -0400, Jonah Palmer wrote:
>>>>>>>>>>>>> This effort was started to reduce the guest visible downtime by
>>>>>>>>>>>>> virtio-net/vhost-net/vhost-vDPA during live migration, especially
>>>>>>>>>>>>> vhost-vDPA.
>>>>>>>>>>>>>
>>>>>>>>>>>>> The downtime contributed by vhost-vDPA, for example, is not from having to
>>>>>>>>>>>>> migrate a lot of state but rather expensive backend control-plane latency
>>>>>>>>>>>>> like CVQ configurations (e.g. MQ queue pairs, RSS, MAC/VLAN filters, offload
>>>>>>>>>>>>> settings, MTU, etc.). Doing this requires kernel/HW NIC operations which
>>>>>>>>>>>>> dominates its downtime.
>>>>>>>>>>>>>
>>>>>>>>>>>>> In other words, by migrating the state of virtio-net early (before the
>>>>>>>>>>>>> stop-and-copy phase), we can also start staging backend configurations,
>>>>>>>>>>>>> which is the main contributor of downtime when migrating a vhost-vDPA
>>>>>>>>>>>>> device.
>>>>>>>>>>>>>
>>>>>>>>>>>>> I apologize if this series gives the impression that we're migrating a lot
>>>>>>>>>>>>> of data here. It's more along the lines of moving control-plane latency out
>>>>>>>>>>>>> of the stop-and-copy phase.
>>>>>>>>>>>>
>>>>>>>>>>>> I see, thanks.
>>>>>>>>>>>>
>>>>>>>>>>>> Please add these into the cover letter of the next post.  IMHO it's
>>>>>>>>>>>> extremely important information to explain the real goal of this work.  I
>>>>>>>>>>>> bet it is not expected for most people when reading the current cover
>>>>>>>>>>>> letter.
>>>>>>>>>>>>
>>>>>>>>>>>> Then it could have nothing to do with iterative phase, am I right?
>>>>>>>>>>>>
>>>>>>>>>>>> What are the data needed for the dest QEMU to start staging backend
>>>>>>>>>>>> configurations to the HWs underneath?  Does dest QEMU already have them in
>>>>>>>>>>>> the cmdlines?
>>>>>>>>>>>>
>>>>>>>>>>>> Asking this because I want to know whether it can be done completely
>>>>>>>>>>>> without src QEMU at all, e.g. when dest QEMU starts.
>>>>>>>>>>>>
>>>>>>>>>>>> If src QEMU's data is still needed, please also first consider providing
>>>>>>>>>>>> such facility using an "early VMSD" if it is ever possible: feel free to
>>>>>>>>>>>> refer to commit 3b95a71b22827d26178.
>>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> While it works for this series, it does not allow to resend the state
>>>>>>>>>>> when the src device changes. For example, if the number of virtqueues
>>>>>>>>>>> is modified.
>>>>>>>>>>
>>>>>>>>>> Some explanation on "how sync number of vqueues helps downtime" would help.
>>>>>>>>>> Not "it might preheat things", but exactly why, and how that differs when
>>>>>>>>>> it's pure software, and when hardware will be involved.
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> According to NVIDIA engineers, configuring vqs (number, size, RSS, etc.)
>>>>>>>>> takes about 200ms:
>>>>>>>>> https://urldefense.com/v3/__https://lore.kernel.org/qemu-devel/6c8ebb97-d546-3f1c-4cdd-54e23a566f61@nvidia.com/T/__;!!ACWV5N9M2RV99hQ!OQdf7sGaBlbXhcFHX7AC7HgYxvFljgwWlIgJCvMgWwFvPqMrAMbWqf0862zV5shIjaUvlrk54fLTK6uo2pA$
>>>>>>>>>
>>>>>>>>> Adding Dragos here in case he can provide more details. Maybe the
>>>>>>>>> numbers have changed though.
>>>>>>>>>
>>>>>>>>> And I guess the difference with pure SW will always come down to PCI
>>>>>>>>> communications, which I assume are slower than configuring the host SW
>>>>>>>>> device in RAM or even CPU cache. But I admit that proper profiling is
>>>>>>>>> needed before making those claims.
>>>>>>>>>
>>>>>>>>> Jonah, can you print the time it takes to configure the vDPA device
>>>>>>>>> with traces vs the time it takes to enable the dataplane of the
>>>>>>>>> device? So we can get an idea of how much time we save with this.
>>>>>>>>>
>>>>>>>>
>>>>>>>> Let me know if this isn't what you're looking for.
>>>>>>>>
>>>>>>>> I'm assuming by "configuration time" you mean:
>>>>>>>>       - Time from device startup (entry to vhost_vdpa_dev_start()) to right
>>>>>>>>         before we start enabling the vrings (e.g.
>>>>>>>>         VHOST_VDPA_SET_VRING_ENABLE in vhost_vdpa_net_cvq_load()).
>>>>>>>>
>>>>>>>> And by "time taken to enable the dataplane" I'm assuming you mean:
>>>>>>>>       - Time right before we start enabling the vrings (see above) to right
>>>>>>>>         after we enable the last vring (at the end of
>>>>>>>>         vhost_vdpa_net_cvq_load())
>>>>>>>>
>>>>>>>> Guest specs: 128G Mem, SVQ=on, CVQ=on, 8 queue pairs:
>>>>>>>>
>>>>>>>> -netdev type=vhost-vdpa,vhostdev=$VHOST_VDPA_0,id=vhost-vdpa0,
>>>>>>>>              queues=8,x-svq=on
>>>>>>>>
>>>>>>>> -device virtio-net-pci,netdev=vhost-vdpa0,id=vdpa0,bootindex=-1,
>>>>>>>>              romfile=,page-per-vq=on,mac=$VF1_MAC,ctrl_vq=on,mq=on,
>>>>>>>>              ctrl_vlan=off,vectors=18,host_mtu=9000,
>>>>>>>>              disable-legacy=on,disable-modern=off
>>>>>>>>
>>>>>>>> ---
>>>>>>>>
>>>>>>>> Configuration time:    ~31s
>>>>>>>> Dataplane enable time: ~0.14ms
>>>>>>>>
>>>>>>>
>>>>>>> I was vague, but yes, that's representative enough! It would be more
>>>>>>> accurate if the configuration time ends by the time QEMU enables the
>>>>>>> first queue of the dataplane though.
>>>>>>>
>>>>>>> As Si-Wei mentions, is v->shared->listener_registered == true at the
>>>>>>> beginning of vhost_vdpa_dev_start?
>>>>>>>
>>>>>>
>>>>>> Ah, I also realized that Qemu I was using for measurements was using a
>>>>>> version before the listener_registered member was introduced.
>>>>>>
>>>>>> I retested with the latest changes in Qemu and set x-svq=off, e.g.:
>>>>>> guest specs: 128G Mem, SVQ=off, CVQ=on, 8 queue pairs. I ran testing 3
>>>>>> times for measurements.
>>>>>>
>>>>>> v->shared->listener_registered == false at the beginning of
>>>>>> vhost_vdpa_dev_start().
>>>>>>
>>>>>
>>>>> Let's move out the effect of the mem pinning from the downtime by
>>>>> registering the listener before the migration. Can you check why is it
>>>>> not registered at vhost_vdpa_set_owner?
>>>>>
>>>>
>>>> Sorry I was profiling improperly. The listener is registered at
>>>> vhost_vdpa_set_owner initially and v->shared->listener_registered is set
>>>> to true, but once we reach the first vhost_vdpa_dev_start call, it shows
>>>> as false and is re-registered later in the function.
>>>>
>>>> Should we always expect listener_registered == true at every
>>>> vhost_vdpa_dev_start call during startup?
>>>
>>> Yes, that leaves all the memory pinning time out of the downtime.
>>>
>>>> This is what I traced during
>>>> startup of a single guest (no migration).
>>>
>>> We can trace the destination's QEMU to be more accurate, but probably
>>> it makes no difference.
>>>
>>>> Tracepoint is right at the
>>>> start of the vhost_vdpa_dev_start function:
>>>>
>>>> vhost_vdpa_set_owner() - register memory listener
>>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
>>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
>>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
>>>
>>> This is surprising. Can you trace how listener_registered goes to 0 again?
>>>
>>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
>>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
>>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
>>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
>>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
>>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
>>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
>>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
>>>> ...
>>>> * VQs are now being enabled *
>>>>
>>>> I'm also seeing that when the guest is being shutdown,
>>>> dev->vhost_ops->vhost_get_vring_base() is failing in
>>>> do_vhost_virtqueue_stop():
>>>>
>>>> ...
>>>> [  114.718429] systemd-shutdown[1]: Syncing filesystems and block devices.
>>>> [  114.719255] systemd-shutdown[1]: Powering off.
>>>> [  114.719916] sd 0:0:0:0: [sda] Synchronizing SCSI cache
>>>> [  114.724826] ACPI: PM: Preparing to enter system sleep state S5
>>>> [  114.725593] reboot: Power down
>>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
>>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
>>>> qemu-system-x86_64: vhost VQ 2 ring restore failed: -1: Operation not
>>>> permitted (1)
>>>> qemu-system-x86_64: vhost VQ 3 ring restore failed: -1: Operation not
>>>> permitted (1)
>>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
>>>> qemu-system-x86_64: vhost VQ 4 ring restore failed: -1: Operation not
>>>> permitted (1)
>>>> qemu-system-x86_64: vhost VQ 5 ring restore failed: -1: Operation not
>>>> permitted (1)
>>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
>>>> qemu-system-x86_64: vhost VQ 6 ring restore failed: -1: Operation not
>>>> permitted (1)
>>>> qemu-system-x86_64: vhost VQ 7 ring restore failed: -1: Operation not
>>>> permitted (1)
>>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
>>>> qemu-system-x86_64: vhost VQ 8 ring restore failed: -1: Operation not
>>>> permitted (1)
>>>> qemu-system-x86_64: vhost VQ 9 ring restore failed: -1: Operation not
>>>> permitted (1)
>>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
>>>> qemu-system-x86_64: vhost VQ 10 ring restore failed: -1: Operation not
>>>> permitted (1)
>>>> qemu-system-x86_64: vhost VQ 11 ring restore failed: -1: Operation not
>>>> permitted (1)
>>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
>>>> qemu-system-x86_64: vhost VQ 12 ring restore failed: -1: Operation not
>>>> permitted (1)
>>>> qemu-system-x86_64: vhost VQ 13 ring restore failed: -1: Operation not
>>>> permitted (1)
>>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
>>>> qemu-system-x86_64: vhost VQ 14 ring restore failed: -1: Operation not
>>>> permitted (1)
>>>> qemu-system-x86_64: vhost VQ 15 ring restore failed: -1: Operation not
>>>> permitted (1)
>>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
>>>>
>>>> However when x-svq=on, I don't see these errors on shutdown.
>>>>
>>>
>>> SVQ can mask this error as it does not need to forward the ring
>>> restore message to the device. It can just start with 0 and convert
>>> indexes.
>>>
>>> Let's focus on listener_registered first :).
>>>
>>>>>> ---
>>>>>>
>>>>>> Configuration time: Time from first entry into vhost_vdpa_dev_start() to
>>>>>> right after Qemu enables the first VQ.
>>>>>>      - 26.947s, 26.606s, 27.326s
>>>>>>
>>>>>> Enable dataplane: Time from right after first VQ is enabled to right
>>>>>> after the last VQ is enabled.
>>>>>>      - 0.081ms, 0.081ms, 0.079ms
>>>>>>
>>>>>
>>>>
>>>
>>
>> I looked into this a bit more and realized I was being naive thinking
>> that the vhost-vDPA device startup path of a single VM would be the same
>> as that on a destination VM during live migration. This is **not** the
>> case and I apologize for the confusion I caused.
>>
>> What I described and profiled above is indeed true for the startup of a
>> single VM / source VM with a vhost-vDPA device. However, this is not
>> true on the destination side and its configuration time is drastically
>> different.
>>
>> Under the same specs, but now with a live migration performed between a
>> source and destination VM (128G Mem, SVQ=off, CVQ=on, 8 queue pairs),
>> and using the same tracepoints to find the configuration time and enable
>> dataplane time, these are the measurements I found for the **destination
>> VM**:
>>
>> Configuration time: Time from first entry into vhost_vdpa_dev_start to
>> right after Qemu enables the first VQ.
>>      - 268.603ms, 241.515ms, 249.007ms
>>
>> Enable dataplane time: Time from right after the first VQ is enabled to
>> right after the last VQ is enabled.
>>      - 0.072ms, 0.071ms, 0.070ms
>>
>> ---
>>
>> For those curious, using the same printouts as I did above, this is what
>> it actually looks like on the destination side:
>>
>> * Destination VM is started *
>>
>> vhost_vdpa_set_owner() - register memory listener
>> vhost_vdpa_reset_device() - unregistering listener
>>
>> * Start live migration on source VM *
>> (qemu) migrate unix:/tmp/lm.sock
>> ...
>>
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
>> vhost_vdpa_dev_start() - register listener
>>
> 
> That's weird, can you check why the memory listener is not registered
> at vhost_vdpa_set_owner? Or, if it is registered, why is it not
> registered by the time vhost_vdpa_dev_start is called? This changes
> the downtime a lot, more than half of the time is spent on this. So it
> is worth fixing it before continuing.
> 

The memory listener is registered at vhost_vdpa_set_owner, but the 
reason we see v->shared->listener_registered == 0 by the time 
vhost_vdpa_dev_start is called is due to the vhost_vdpa_reset_device 
that's called shortly after.

But this re-registering is relatively quick compared to how long it 
takes during its initialization sequence.

>> And this is very different than the churning we saw in my previous email
>> that happens on the source / single guest VM with vhost-vDPA and its
>> startup path.
>>
>> ---
>>
>> Again, apologies on the confusion this caused. This was my fault for not
>> being more careful.
>>
> 
> No worries!
>

Re: [RFC 5/6] virtio,virtio-net: skip consistency check in virtio_load for iterative migration

Posted by Eugenio Perez Martin 2 months, 2 weeks ago

On Mon, Sep 1, 2025 at 3:17 PM Jonah Palmer <jonah.palmer@oracle.com> wrote:
>
>
>
> On 9/1/25 2:57 AM, Eugenio Perez Martin wrote:
> > On Wed, Aug 27, 2025 at 6:56 PM Jonah Palmer <jonah.palmer@oracle.com> wrote:
> >>
> >>
> >>
> >> On 8/20/25 3:59 AM, Eugenio Perez Martin wrote:
> >>> On Tue, Aug 19, 2025 at 5:11 PM Jonah Palmer <jonah.palmer@oracle.com> wrote:
> >>>>
> >>>>
> >>>>
> >>>> On 8/19/25 3:10 AM, Eugenio Perez Martin wrote:
> >>>>> On Mon, Aug 18, 2025 at 4:46 PM Jonah Palmer <jonah.palmer@oracle.com> wrote:
> >>>>>>
> >>>>>>
> >>>>>>
> >>>>>> On 8/18/25 2:51 AM, Eugenio Perez Martin wrote:
> >>>>>>> On Fri, Aug 15, 2025 at 4:50 PM Jonah Palmer <jonah.palmer@oracle.com> wrote:
> >>>>>>>>
> >>>>>>>>
> >>>>>>>>
> >>>>>>>> On 8/14/25 5:28 AM, Eugenio Perez Martin wrote:
> >>>>>>>>> On Wed, Aug 13, 2025 at 4:06 PM Peter Xu <peterx@redhat.com> wrote:
> >>>>>>>>>>
> >>>>>>>>>> On Wed, Aug 13, 2025 at 11:25:00AM +0200, Eugenio Perez Martin wrote:
> >>>>>>>>>>> On Mon, Aug 11, 2025 at 11:56 PM Peter Xu <peterx@redhat.com> wrote:
> >>>>>>>>>>>>
> >>>>>>>>>>>> On Mon, Aug 11, 2025 at 05:26:05PM -0400, Jonah Palmer wrote:
> >>>>>>>>>>>>> This effort was started to reduce the guest visible downtime by
> >>>>>>>>>>>>> virtio-net/vhost-net/vhost-vDPA during live migration, especially
> >>>>>>>>>>>>> vhost-vDPA.
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> The downtime contributed by vhost-vDPA, for example, is not from having to
> >>>>>>>>>>>>> migrate a lot of state but rather expensive backend control-plane latency
> >>>>>>>>>>>>> like CVQ configurations (e.g. MQ queue pairs, RSS, MAC/VLAN filters, offload
> >>>>>>>>>>>>> settings, MTU, etc.). Doing this requires kernel/HW NIC operations which
> >>>>>>>>>>>>> dominates its downtime.
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> In other words, by migrating the state of virtio-net early (before the
> >>>>>>>>>>>>> stop-and-copy phase), we can also start staging backend configurations,
> >>>>>>>>>>>>> which is the main contributor of downtime when migrating a vhost-vDPA
> >>>>>>>>>>>>> device.
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> I apologize if this series gives the impression that we're migrating a lot
> >>>>>>>>>>>>> of data here. It's more along the lines of moving control-plane latency out
> >>>>>>>>>>>>> of the stop-and-copy phase.
> >>>>>>>>>>>>
> >>>>>>>>>>>> I see, thanks.
> >>>>>>>>>>>>
> >>>>>>>>>>>> Please add these into the cover letter of the next post.  IMHO it's
> >>>>>>>>>>>> extremely important information to explain the real goal of this work.  I
> >>>>>>>>>>>> bet it is not expected for most people when reading the current cover
> >>>>>>>>>>>> letter.
> >>>>>>>>>>>>
> >>>>>>>>>>>> Then it could have nothing to do with iterative phase, am I right?
> >>>>>>>>>>>>
> >>>>>>>>>>>> What are the data needed for the dest QEMU to start staging backend
> >>>>>>>>>>>> configurations to the HWs underneath?  Does dest QEMU already have them in
> >>>>>>>>>>>> the cmdlines?
> >>>>>>>>>>>>
> >>>>>>>>>>>> Asking this because I want to know whether it can be done completely
> >>>>>>>>>>>> without src QEMU at all, e.g. when dest QEMU starts.
> >>>>>>>>>>>>
> >>>>>>>>>>>> If src QEMU's data is still needed, please also first consider providing
> >>>>>>>>>>>> such facility using an "early VMSD" if it is ever possible: feel free to
> >>>>>>>>>>>> refer to commit 3b95a71b22827d26178.
> >>>>>>>>>>>>
> >>>>>>>>>>>
> >>>>>>>>>>> While it works for this series, it does not allow to resend the state
> >>>>>>>>>>> when the src device changes. For example, if the number of virtqueues
> >>>>>>>>>>> is modified.
> >>>>>>>>>>
> >>>>>>>>>> Some explanation on "how sync number of vqueues helps downtime" would help.
> >>>>>>>>>> Not "it might preheat things", but exactly why, and how that differs when
> >>>>>>>>>> it's pure software, and when hardware will be involved.
> >>>>>>>>>>
> >>>>>>>>>
> >>>>>>>>> According to nvidia engineers, configuring vqs (number, size, RSS, etc)
> >>>>>>>>> takes about 200ms:
> >>>>>>>>> https://urldefense.com/v3/__https://lore.kernel.org/qemu-devel/6c8ebb97-d546-3f1c-4cdd-54e23a566f61@nvidia.com/T/__;!!ACWV5N9M2RV99hQ!OQdf7sGaBlbXhcFHX7AC7HgYxvFljgwWlIgJCvMgWwFvPqMrAMbWqf0862zV5shIjaUvlrk54fLTK6uo2pA$
> >>>>>>>>>
> >>>>>>>>> Adding Dragos here in case he can provide more details. Maybe the
> >>>>>>>>> numbers have changed though.
> >>>>>>>>>
> >>>>>>>>> And I guess the difference with pure SW will always come down to PCI
> >>>>>>>>> communications, which I assume are slower than configuring the host SW
> >>>>>>>>> device in RAM or even CPU cache. But I admit that proper profiling is
> >>>>>>>>> needed before making those claims.
> >>>>>>>>>
> >>>>>>>>> Jonah, can you print the time it takes to configure the vDPA device
> >>>>>>>>> with traces vs the time it takes to enable the dataplane of the
> >>>>>>>>> device? So we can get an idea of how much time we save with this.
> >>>>>>>>>
> >>>>>>>>
> >>>>>>>> Let me know if this isn't what you're looking for.
> >>>>>>>>
> >>>>>>>> I'm assuming by "configuration time" you mean:
> >>>>>>>>       - Time from device startup (entry to vhost_vdpa_dev_start()) to right
> >>>>>>>>         before we start enabling the vrings (e.g.
> >>>>>>>>         VHOST_VDPA_SET_VRING_ENABLE in vhost_vdpa_net_cvq_load()).
> >>>>>>>>
> >>>>>>>> And by "time taken to enable the dataplane" I'm assuming you mean:
> >>>>>>>>       - Time right before we start enabling the vrings (see above) to right
> >>>>>>>>         after we enable the last vring (at the end of
> >>>>>>>>         vhost_vdpa_net_cvq_load())
> >>>>>>>>
> >>>>>>>> Guest specs: 128G Mem, SVQ=on, CVQ=on, 8 queue pairs:
> >>>>>>>>
> >>>>>>>> -netdev type=vhost-vdpa,vhostdev=$VHOST_VDPA_0,id=vhost-vdpa0,
> >>>>>>>>              queues=8,x-svq=on
> >>>>>>>>
> >>>>>>>> -device virtio-net-pci,netdev=vhost-vdpa0,id=vdpa0,bootindex=-1,
> >>>>>>>>              romfile=,page-per-vq=on,mac=$VF1_MAC,ctrl_vq=on,mq=on,
> >>>>>>>>              ctrl_vlan=off,vectors=18,host_mtu=9000,
> >>>>>>>>              disable-legacy=on,disable-modern=off
> >>>>>>>>
> >>>>>>>> ---
> >>>>>>>>
> >>>>>>>> Configuration time:    ~31s
> >>>>>>>> Dataplane enable time: ~0.14ms
> >>>>>>>>
> >>>>>>>
> >>>>>>> I was vague, but yes, that's representative enough! It would be more
> >>>>>>> accurate if the configuration time ends by the time QEMU enables the
> >>>>>>> first queue of the dataplane though.
> >>>>>>>
> >>>>>>> As Si-Wei mentions, is v->shared->listener_registered == true at the
> >>>>>>> beginning of vhost_vdpa_dev_start?
> >>>>>>>
> >>>>>>
> >>>>>> Ah, I also realized that Qemu I was using for measurements was using a
> >>>>>> version before the listener_registered member was introduced.
> >>>>>>
> >>>>>> I retested with the latest changes in Qemu and set x-svq=off, e.g.:
> >>>>>> guest specs: 128G Mem, SVQ=off, CVQ=on, 8 queue pairs. I ran testing 3
> >>>>>> times for measurements.
> >>>>>>
> >>>>>> v->shared->listener_registered == false at the beginning of
> >>>>>> vhost_vdpa_dev_start().
> >>>>>>
> >>>>>
> >>>>> Let's move out the effect of the mem pinning from the downtime by
> >>>>> registering the listener before the migration. Can you check why is it
> >>>>> not registered at vhost_vdpa_set_owner?
> >>>>>
> >>>>
> >>>> Sorry I was profiling improperly. The listener is registered at
> >>>> vhost_vdpa_set_owner initially and v->shared->listener_registered is set
> >>>> to true, but once we reach the first vhost_vdpa_dev_start call, it shows
> >>>> as false and is re-registered later in the function.
> >>>>
> >>>> Should we always expect listener_registered == true at every
> >>>> vhost_vdpa_dev_start call during startup?
> >>>
> >>> Yes, that leaves all the memory pinning time out of the downtime.
> >>>
> >>>> This is what I traced during
> >>>> startup of a single guest (no migration).
> >>>
> >>> We can trace the destination's QEMU to be more accurate, but probably
> >>> it makes no difference.
> >>>
> >>>> Tracepoint is right at the
> >>>> start of the vhost_vdpa_dev_start function:
> >>>>
> >>>> vhost_vdpa_set_owner() - register memory listener
> >>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> >>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
> >>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> >>>
> >>> This is surprising. Can you trace how listener_registered goes to 0 again?
> >>>
> >>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> >>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> >>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> >>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> >>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> >>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> >>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> >>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> >>>> ...
> >>>> * VQs are now being enabled *
> >>>>
> >>>> I'm also seeing that when the guest is being shutdown,
> >>>> dev->vhost_ops->vhost_get_vring_base() is failing in
> >>>> do_vhost_virtqueue_stop():
> >>>>
> >>>> ...
> >>>> [  114.718429] systemd-shutdown[1]: Syncing filesystems and block devices.
> >>>> [  114.719255] systemd-shutdown[1]: Powering off.
> >>>> [  114.719916] sd 0:0:0:0: [sda] Synchronizing SCSI cache
> >>>> [  114.724826] ACPI: PM: Preparing to enter system sleep state S5
> >>>> [  114.725593] reboot: Power down
> >>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
> >>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
> >>>> qemu-system-x86_64: vhost VQ 2 ring restore failed: -1: Operation not
> >>>> permitted (1)
> >>>> qemu-system-x86_64: vhost VQ 3 ring restore failed: -1: Operation not
> >>>> permitted (1)
> >>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
> >>>> qemu-system-x86_64: vhost VQ 4 ring restore failed: -1: Operation not
> >>>> permitted (1)
> >>>> qemu-system-x86_64: vhost VQ 5 ring restore failed: -1: Operation not
> >>>> permitted (1)
> >>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
> >>>> qemu-system-x86_64: vhost VQ 6 ring restore failed: -1: Operation not
> >>>> permitted (1)
> >>>> qemu-system-x86_64: vhost VQ 7 ring restore failed: -1: Operation not
> >>>> permitted (1)
> >>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
> >>>> qemu-system-x86_64: vhost VQ 8 ring restore failed: -1: Operation not
> >>>> permitted (1)
> >>>> qemu-system-x86_64: vhost VQ 9 ring restore failed: -1: Operation not
> >>>> permitted (1)
> >>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
> >>>> qemu-system-x86_64: vhost VQ 10 ring restore failed: -1: Operation not
> >>>> permitted (1)
> >>>> qemu-system-x86_64: vhost VQ 11 ring restore failed: -1: Operation not
> >>>> permitted (1)
> >>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
> >>>> qemu-system-x86_64: vhost VQ 12 ring restore failed: -1: Operation not
> >>>> permitted (1)
> >>>> qemu-system-x86_64: vhost VQ 13 ring restore failed: -1: Operation not
> >>>> permitted (1)
> >>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
> >>>> qemu-system-x86_64: vhost VQ 14 ring restore failed: -1: Operation not
> >>>> permitted (1)
> >>>> qemu-system-x86_64: vhost VQ 15 ring restore failed: -1: Operation not
> >>>> permitted (1)
> >>>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
> >>>>
> >>>> However when x-svq=on, I don't see these errors on shutdown.
> >>>>
> >>>
> >>> SVQ can mask this error as it does not need to forward the ring
> >>> restore message to the device. It can just start with 0 and convert
> >>> indexes.
> >>>
> >>> Let's focus on listener_registered first :).
> >>>
> >>>>>> ---
> >>>>>>
> >>>>>> Configuration time: Time from first entry into vhost_vdpa_dev_start() to
> >>>>>> right after Qemu enables the first VQ.
> >>>>>>      - 26.947s, 26.606s, 27.326s
> >>>>>>
> >>>>>> Enable dataplane: Time from right after first VQ is enabled to right
> >>>>>> after the last VQ is enabled.
> >>>>>>      - 0.081ms, 0.081ms, 0.079ms
> >>>>>>
> >>>>>
> >>>>
> >>>
> >>
> >> I looked into this a bit more and realized I was being naive thinking
> >> that the vhost-vDPA device startup path of a single VM would be the same
> >> as that on a destination VM during live migration. This is **not** the
> >> case and I apologize for the confusion I caused.
> >>
> >> What I described and profiled above is indeed true for the startup of a
> >> single VM / source VM with a vhost-vDPA device. However, this is not
> >> true on the destination side and its configuration time is drastically
> >> different.
> >>
> >> Under the same specs, but now with a live migration performed between a
> >> source and destination VM (128G Mem, SVQ=off, CVQ=on, 8 queue pairs),
> >> and using the same tracepoints to find the configuration time and enable
> >> dataplane time, these are the measurements I found for the **destination
> >> VM**:
> >>
> >> Configuration time: Time from first entry into vhost_vdpa_dev_start to
> >> right after Qemu enables the first VQ.
> >>      - 268.603ms, 241.515ms, 249.007ms
> >>
> >> Enable dataplane time: Time from right after the first VQ is enabled to
> >> right after the last VQ is enabled.
> >>      - 0.072ms, 0.071ms, 0.070ms
> >>
> >> ---
> >>
> >> For those curious, using the same printouts as I did above, this is what
> >> it actually looks like on the destination side:
> >>
> >> * Destination VM is started *
> >>
> >> vhost_vdpa_set_owner() - register memory listener
> >> vhost_vdpa_reset_device() - unregistering listener
> >>
> >> * Start live migration on source VM *
> >> (qemu) migrate unix:/tmp/lm.sock
> >> ...
> >>
> >> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> >> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> >> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> >> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> >> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> >> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> >> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> >> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> >> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> >> vhost_vdpa_dev_start() - register listener
> >>
> >
> > That's weird, can you check why the memory listener is not registered
> > at vhost_vdpa_set_owner? Or, if it is registered, why is it not
> > registered by the time vhost_vdpa_dev_start is called? This changes
> > the downtime a lot, more than half of the time is spent on this. So it
> > is worth fixing it before continuing.
> >
>
> The memory listener is registered at vhost_vdpa_set_owner, but the
> reason we see v->shared->listener_registered == 0 by the time
> vhost_vdpa_dev_start is called is due to the vhost_vdpa_reset_device
> that's called shortly after.
>

Ok, I missed the status of this.

This first reset is avoidable actually. I see two routes for this:
1) Do not reset if shared->listener_registered. Maybe we should rename
that member actually, as now it means something like "The device is
blank and ready to be configured". Or maybe dedicate two variables or
flags; it is a shame to lose the precision of "listener_registered".
2) Implement the VHOST_BACKEND_F_IOTLB_PERSIST part of Si-Wei's series [1].

I'd greatly prefer option 1, as it does not depend on the backend
features and it is more generic. But option 2 will be needed to
reduce the SVQ transition downtime too.

> But this re-registering is relatively quick compared to how long it
> takes during its initialization sequence.
>

That's interesting, I guess it is because the regions are warm. Can
you measure the time of it so we can evaluate if it is worth comparing
with the iterative migration?

Thanks!

[1] https://lists.nongnu.org/archive/html/qemu-devel/2023-12/msg00909.html

Re: [RFC 5/6] virtio,virtio-net: skip consistency check in virtio_load for iterative migration

Posted by Jonah Palmer 2 months, 3 weeks ago


On 8/20/25 3:59 AM, Eugenio Perez Martin wrote:
> On Tue, Aug 19, 2025 at 5:11 PM Jonah Palmer <jonah.palmer@oracle.com> wrote:
>>
>>
>>
>> On 8/19/25 3:10 AM, Eugenio Perez Martin wrote:
>>> On Mon, Aug 18, 2025 at 4:46 PM Jonah Palmer <jonah.palmer@oracle.com> wrote:
>>>>
>>>>
>>>>
>>>> On 8/18/25 2:51 AM, Eugenio Perez Martin wrote:
>>>>> On Fri, Aug 15, 2025 at 4:50 PM Jonah Palmer <jonah.palmer@oracle.com> wrote:
>>>>>>
>>>>>>
>>>>>>
>>>>>> On 8/14/25 5:28 AM, Eugenio Perez Martin wrote:
>>>>>>> On Wed, Aug 13, 2025 at 4:06 PM Peter Xu <peterx@redhat.com> wrote:
>>>>>>>>
>>>>>>>> On Wed, Aug 13, 2025 at 11:25:00AM +0200, Eugenio Perez Martin wrote:
>>>>>>>>> On Mon, Aug 11, 2025 at 11:56 PM Peter Xu <peterx@redhat.com> wrote:
>>>>>>>>>>
>>>>>>>>>> On Mon, Aug 11, 2025 at 05:26:05PM -0400, Jonah Palmer wrote:
>>>>>>>>>>> This effort was started to reduce the guest visible downtime by
>>>>>>>>>>> virtio-net/vhost-net/vhost-vDPA during live migration, especially
>>>>>>>>>>> vhost-vDPA.
>>>>>>>>>>>
>>>>>>>>>>> The downtime contributed by vhost-vDPA, for example, is not from having to
>>>>>>>>>>> migrate a lot of state but rather expensive backend control-plane latency
>>>>>>>>>>> like CVQ configurations (e.g. MQ queue pairs, RSS, MAC/VLAN filters, offload
>>>>>>>>>>> settings, MTU, etc.). Doing this requires kernel/HW NIC operations which
>>>>>>>>>>> dominates its downtime.
>>>>>>>>>>>
>>>>>>>>>>> In other words, by migrating the state of virtio-net early (before the
>>>>>>>>>>> stop-and-copy phase), we can also start staging backend configurations,
>>>>>>>>>>> which is the main contributor of downtime when migrating a vhost-vDPA
>>>>>>>>>>> device.
>>>>>>>>>>>
>>>>>>>>>>> I apologize if this series gives the impression that we're migrating a lot
>>>>>>>>>>> of data here. It's more along the lines of moving control-plane latency out
>>>>>>>>>>> of the stop-and-copy phase.
>>>>>>>>>>
>>>>>>>>>> I see, thanks.
>>>>>>>>>>
>>>>>>>>>> Please add these into the cover letter of the next post.  IMHO it's
>>>>>>>>>> extremely important information to explain the real goal of this work.  I
>>>>>>>>>> bet it is not expected for most people when reading the current cover
>>>>>>>>>> letter.
>>>>>>>>>>
>>>>>>>>>> Then it could have nothing to do with iterative phase, am I right?
>>>>>>>>>>
>>>>>>>>>> What are the data needed for the dest QEMU to start staging backend
>>>>>>>>>> configurations to the HWs underneath?  Does dest QEMU already have them in
>>>>>>>>>> the cmdlines?
>>>>>>>>>>
>>>>>>>>>> Asking this because I want to know whether it can be done completely
>>>>>>>>>> without src QEMU at all, e.g. when dest QEMU starts.
>>>>>>>>>>
>>>>>>>>>> If src QEMU's data is still needed, please also first consider providing
>>>>>>>>>> such facility using an "early VMSD" if it is ever possible: feel free to
>>>>>>>>>> refer to commit 3b95a71b22827d26178.
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> While it works for this series, it does not allow to resend the state
>>>>>>>>> when the src device changes. For example, if the number of virtqueues
>>>>>>>>> is modified.
>>>>>>>>
>>>>>>>> Some explanation on "how sync number of vqueues helps downtime" would help.
>>>>>>>> Not "it might preheat things", but exactly why, and how that differs when
>>>>>>>> it's pure software, and when hardware will be involved.
>>>>>>>>
>>>>>>>
>>>>>>> According to nvidia engineers, configuring vqs (number, size, RSS, etc)
>>>>>>> takes about 200ms:
>>>>>>> https://urldefense.com/v3/__https://lore.kernel.org/qemu-devel/6c8ebb97-d546-3f1c-4cdd-54e23a566f61@nvidia.com/T/__;!!ACWV5N9M2RV99hQ!OQdf7sGaBlbXhcFHX7AC7HgYxvFljgwWlIgJCvMgWwFvPqMrAMbWqf0862zV5shIjaUvlrk54fLTK6uo2pA$
>>>>>>>
>>>>>>> Adding Dragos here in case he can provide more details. Maybe the
>>>>>>> numbers have changed though.
>>>>>>>
>>>>>>> And I guess the difference with pure SW will always come down to PCI
>>>>>>> communications, which I assume are slower than configuring the host SW
>>>>>>> device in RAM or even CPU cache. But I admit that proper profiling is
>>>>>>> needed before making those claims.
>>>>>>>
>>>>>>> Jonah, can you print the time it takes to configure the vDPA device
>>>>>>> with traces vs the time it takes to enable the dataplane of the
>>>>>>> device? So we can get an idea of how much time we save with this.
>>>>>>>
>>>>>>
>>>>>> Let me know if this isn't what you're looking for.
>>>>>>
>>>>>> I'm assuming by "configuration time" you mean:
>>>>>>      - Time from device startup (entry to vhost_vdpa_dev_start()) to right
>>>>>>        before we start enabling the vrings (e.g.
>>>>>>        VHOST_VDPA_SET_VRING_ENABLE in vhost_vdpa_net_cvq_load()).
>>>>>>
>>>>>> And by "time taken to enable the dataplane" I'm assuming you mean:
>>>>>>      - Time right before we start enabling the vrings (see above) to right
>>>>>>        after we enable the last vring (at the end of
>>>>>>        vhost_vdpa_net_cvq_load())
>>>>>>
>>>>>> Guest specs: 128G Mem, SVQ=on, CVQ=on, 8 queue pairs:
>>>>>>
>>>>>> -netdev type=vhost-vdpa,vhostdev=$VHOST_VDPA_0,id=vhost-vdpa0,
>>>>>>             queues=8,x-svq=on
>>>>>>
>>>>>> -device virtio-net-pci,netdev=vhost-vdpa0,id=vdpa0,bootindex=-1,
>>>>>>             romfile=,page-per-vq=on,mac=$VF1_MAC,ctrl_vq=on,mq=on,
>>>>>>             ctrl_vlan=off,vectors=18,host_mtu=9000,
>>>>>>             disable-legacy=on,disable-modern=off
>>>>>>
>>>>>> ---
>>>>>>
>>>>>> Configuration time:    ~31s
>>>>>> Dataplane enable time: ~0.14ms
>>>>>>
>>>>>
>>>>> I was vague, but yes, that's representative enough! It would be more
>>>>> accurate if the configuration time ends by the time QEMU enables the
>>>>> first queue of the dataplane though.
>>>>>
>>>>> As Si-Wei mentions, is v->shared->listener_registered == true at the
>>>>> beginning of vhost_vdpa_dev_start?
>>>>>
>>>>
>>>> Ah, I also realized that Qemu I was using for measurements was using a
>>>> version before the listener_registered member was introduced.
>>>>
>>>> I retested with the latest changes in Qemu and set x-svq=off, e.g.:
>>>> guest specs: 128G Mem, SVQ=off, CVQ=on, 8 queue pairs. I ran testing 3
>>>> times for measurements.
>>>>
>>>> v->shared->listener_registered == false at the beginning of
>>>> vhost_vdpa_dev_start().
>>>>
>>>
>>> Let's move out the effect of the mem pinning from the downtime by
>>> registering the listener before the migration. Can you check why is it
>>> not registered at vhost_vdpa_set_owner?
>>>
>>
>> Sorry I was profiling improperly. The listener is registered at
>> vhost_vdpa_set_owner initially and v->shared->listener_registered is set
>> to true, but once we reach the first vhost_vdpa_dev_start call, it shows
>> as false and is re-registered later in the function.
>>
>> Should we always expect listener_registered == true at every
>> vhost_vdpa_dev_start call during startup?
> 
> Yes, that leaves all the memory pinning time out of the downtime.
> 
>> This is what I traced during
>> startup of a single guest (no migration).
> 
> We can trace the destination's QEMU to be more accurate, but probably
> it makes no difference.
> 
>> Tracepoint is right at the
>> start of the vhost_vdpa_dev_start function:
>>
>> vhost_vdpa_set_owner() - register memory listener
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
> 
> This is surprising. Can you trace how listener_registered goes to 0 again?
> 

When vhost_vdpa_dev_start gets called with started == false,
vhost_vdpa_suspend is called, which calls vhost_vdpa_reset_device. That
is where v->shared->listener_registered gets set to false.

And even by the time of the first vhost_vdpa_dev_start call, there had
already been another device reset after the memory listener was
registered in vhost_vdpa_set_owner.

>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 0, started = 1
>> ...
>> * VQs are now being enabled *
>>
>> I'm also seeing that when the guest is being shutdown,
>> dev->vhost_ops->vhost_get_vring_base() is failing in
>> do_vhost_virtqueue_stop():
>>
>> ...
>> [  114.718429] systemd-shutdown[1]: Syncing filesystems and block devices.
>> [  114.719255] systemd-shutdown[1]: Powering off.
>> [  114.719916] sd 0:0:0:0: [sda] Synchronizing SCSI cache
>> [  114.724826] ACPI: PM: Preparing to enter system sleep state S5
>> [  114.725593] reboot: Power down
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
>> qemu-system-x86_64: vhost VQ 2 ring restore failed: -1: Operation not
>> permitted (1)
>> qemu-system-x86_64: vhost VQ 3 ring restore failed: -1: Operation not
>> permitted (1)
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
>> qemu-system-x86_64: vhost VQ 4 ring restore failed: -1: Operation not
>> permitted (1)
>> qemu-system-x86_64: vhost VQ 5 ring restore failed: -1: Operation not
>> permitted (1)
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
>> qemu-system-x86_64: vhost VQ 6 ring restore failed: -1: Operation not
>> permitted (1)
>> qemu-system-x86_64: vhost VQ 7 ring restore failed: -1: Operation not
>> permitted (1)
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
>> qemu-system-x86_64: vhost VQ 8 ring restore failed: -1: Operation not
>> permitted (1)
>> qemu-system-x86_64: vhost VQ 9 ring restore failed: -1: Operation not
>> permitted (1)
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
>> qemu-system-x86_64: vhost VQ 10 ring restore failed: -1: Operation not
>> permitted (1)
>> qemu-system-x86_64: vhost VQ 11 ring restore failed: -1: Operation not
>> permitted (1)
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
>> qemu-system-x86_64: vhost VQ 12 ring restore failed: -1: Operation not
>> permitted (1)
>> qemu-system-x86_64: vhost VQ 13 ring restore failed: -1: Operation not
>> permitted (1)
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
>> qemu-system-x86_64: vhost VQ 14 ring restore failed: -1: Operation not
>> permitted (1)
>> qemu-system-x86_64: vhost VQ 15 ring restore failed: -1: Operation not
>> permitted (1)
>> vhost_vdpa_dev_start() - v->shared->listener_registered = 1, started = 0
>>
>> However when x-svq=on, I don't see these errors on shutdown.
>>
> 
> SVQ can mask this error as it does not need to forward the ring
> restore message to the device. It can just start with 0 and convert
> indexes.
> 
> Let's focus on listener_registered first :).
> 
>>>> ---
>>>>
>>>> Configuration time: Time from first entry into vhost_vdpa_dev_start() to
>>>> right after Qemu enables the first VQ.
>>>>     - 26.947s, 26.606s, 27.326s
>>>>
>>>> Enable dataplane: Time from right after first VQ is enabled to right
>>>> after the last VQ is enabled.
>>>>     - 0.081ms, 0.081ms, 0.079ms
>>>>
>>>
>>
>

Re: [RFC 5/6] virtio,virtio-net: skip consistency check in virtio_load for iterative migration

Posted by Peter Xu 2 months, 4 weeks ago

On Mon, Aug 18, 2025 at 10:46:00AM -0400, Jonah Palmer wrote:
> 
> 
> On 8/18/25 2:51 AM, Eugenio Perez Martin wrote:
> > On Fri, Aug 15, 2025 at 4:50 PM Jonah Palmer <jonah.palmer@oracle.com> wrote:
> > > 
> > > 
> > > 
> > > On 8/14/25 5:28 AM, Eugenio Perez Martin wrote:
> > > > On Wed, Aug 13, 2025 at 4:06 PM Peter Xu <peterx@redhat.com> wrote:
> > > > > 
> > > > > On Wed, Aug 13, 2025 at 11:25:00AM +0200, Eugenio Perez Martin wrote:
> > > > > > On Mon, Aug 11, 2025 at 11:56 PM Peter Xu <peterx@redhat.com> wrote:
> > > > > > > 
> > > > > > > On Mon, Aug 11, 2025 at 05:26:05PM -0400, Jonah Palmer wrote:
> > > > > > > > This effort was started to reduce the guest visible downtime by
> > > > > > > > virtio-net/vhost-net/vhost-vDPA during live migration, especially
> > > > > > > > vhost-vDPA.
> > > > > > > > 
> > > > > > > > The downtime contributed by vhost-vDPA, for example, is not from having to
> > > > > > > > migrate a lot of state but rather expensive backend control-plane latency
> > > > > > > > like CVQ configurations (e.g. MQ queue pairs, RSS, MAC/VLAN filters, offload
> > > > > > > > settings, MTU, etc.). Doing this requires kernel/HW NIC operations which
> > > > > > > > dominates its downtime.
> > > > > > > > 
> > > > > > > > In other words, by migrating the state of virtio-net early (before the
> > > > > > > > stop-and-copy phase), we can also start staging backend configurations,
> > > > > > > > which is the main contributor of downtime when migrating a vhost-vDPA
> > > > > > > > device.
> > > > > > > > 
> > > > > > > > I apologize if this series gives the impression that we're migrating a lot
> > > > > > > > of data here. It's more along the lines of moving control-plane latency out
> > > > > > > > of the stop-and-copy phase.
> > > > > > > 
> > > > > > > I see, thanks.
> > > > > > > 
> > > > > > > Please add these into the cover letter of the next post.  IMHO it's
> > > > > > > extremely important information to explain the real goal of this work.  I
> > > > > > > bet it is not expected for most people when reading the current cover
> > > > > > > letter.
> > > > > > > 
> > > > > > > Then it could have nothing to do with iterative phase, am I right?
> > > > > > > 
> > > > > > > What are the data needed for the dest QEMU to start staging backend
> > > > > > > configurations to the HWs underneath?  Does dest QEMU already have them in
> > > > > > > the cmdlines?
> > > > > > > 
> > > > > > > Asking this because I want to know whether it can be done completely
> > > > > > > without src QEMU at all, e.g. when dest QEMU starts.
> > > > > > > 
> > > > > > > If src QEMU's data is still needed, please also first consider providing
> > > > > > > such facility using an "early VMSD" if it is ever possible: feel free to
> > > > > > > refer to commit 3b95a71b22827d26178.
> > > > > > > 
> > > > > > 
> > > > > > While it works for this series, it does not allow to resend the state
> > > > > > when the src device changes. For example, if the number of virtqueues
> > > > > > is modified.
> > > > > 
> > > > > Some explanation on "how sync number of vqueues helps downtime" would help.
> > > > > Not "it might preheat things", but exactly why, and how that differs when
> > > > > it's pure software, and when hardware will be involved.
> > > > > 
> > > > 
> > > > By nvidia engineers to configure vqs (number, size, RSS, etc) takes
> > > > about ~200ms:
> > > > https://urldefense.com/v3/__https://lore.kernel.org/qemu-devel/6c8ebb97-d546-3f1c-4cdd-54e23a566f61@nvidia.com/T/__;!!ACWV5N9M2RV99hQ!OQdf7sGaBlbXhcFHX7AC7HgYxvFljgwWlIgJCvMgWwFvPqMrAMbWqf0862zV5shIjaUvlrk54fLTK6uo2pA$
> > > > 
> > > > Adding Dragos here in case he can provide more details. Maybe the
> > > > numbers have changed though.
> > > > 
> > > > And I guess the difference with pure SW will always come down to PCI
> > > > communications, which assume it is slower than configuring the host SW
> > > > device in RAM or even CPU cache. But I admin that proper profiling is
> > > > needed before making those claims.
> > > > 
> > > > Jonah, can you print the time it takes to configure the vDPA device
> > > > with traces vs the time it takes to enable the dataplane of the
> > > > device? So we can get an idea of how much time we save with this.
> > > > 
> > > 
> > > Let me know if this isn't what you're looking for.
> > > 
> > > I'm assuming by "configuration time" you mean:
> > >    - Time from device startup (entry to vhost_vdpa_dev_start()) to right
> > >      before we start enabling the vrings (e.g.
> > >      VHOST_VDPA_SET_VRING_ENABLE in vhost_vdpa_net_cvq_load()).
> > > 
> > > And by "time taken to enable the dataplane" I'm assuming you mean:
> > >    - Time right before we start enabling the vrings (see above) to right
> > >      after we enable the last vring (at the end of
> > >      vhost_vdpa_net_cvq_load())
> > > 
> > > Guest specs: 128G Mem, SVQ=on, CVQ=on, 8 queue pairs:
> > > 
> > > -netdev type=vhost-vdpa,vhostdev=$VHOST_VDPA_0,id=vhost-vdpa0,
> > >           queues=8,x-svq=on
> > > 
> > > -device virtio-net-pci,netdev=vhost-vdpa0,id=vdpa0,bootindex=-1,
> > >           romfile=,page-per-vq=on,mac=$VF1_MAC,ctrl_vq=on,mq=on,
> > >           ctrl_vlan=off,vectors=18,host_mtu=9000,
> > >           disable-legacy=on,disable-modern=off
> > > 
> > > ---
> > > 
> > > Configuration time:    ~31s
> > > Dataplane enable time: ~0.14ms
> > > 
> > 
> > I was vague, but yes, that's representative enough! It would be more
> > accurate if the configuration time ends by the time QEMU enables the
> > first queue of the dataplane though.
> > 
> > As Si-Wei mentions, is v->shared->listener_registered == true at the
> > beginning of vhost_vdpa_dev_start?
> > 
> 
> Ah, I also realized that Qemu I was using for measurements was using a
> version before the listener_registered member was introduced.
> 
> I retested with the latest changes in Qemu and set x-svq=off, e.g.: guest
> specs: 128G Mem, SVQ=off, CVQ=on, 8 queue pairs. I ran testing 3 times for
> measurements.
> 
> v->shared->listener_registered == false at the beginning of
> vhost_vdpa_dev_start().
> 
> ---
> 
> Configuration time: Time from first entry into vhost_vdpa_dev_start() to
> right after Qemu enables the first VQ.
>  - 26.947s, 26.606s, 27.326s

It's surprising to know it takes 20+ seconds for one device to load.

Sorry, I'm not familiar with CVQ, so please bear with my ignorance: how
much does CVQ=on contribute to this?  Is page pinning involved here?  Is
the 128GB of guest memory using small pages only?

It looks to me like vDPA will still face many challenges similar to the
ones VFIO already had.  For example, there's current work optimizing
pinning for VFIO here:

https://lore.kernel.org/all/20250814064714.56485-1-lizhe.67@bytedance.com/

For the long term, I'm not sure whether (for either VFIO or vDPA, or
similar devices that need guest pinning) it would make more sense to
start using 1G huge pages just for the sake of fast pinning.

PFNMAP in VFIO already works with 1G pfnmaps with commit eb996eec783c.
Logically, if we could use 1G pages (e.g. on x86_64) for the guest, then
pinning / unpinning could also be easily batched, and DMA pinning should be
much faster.  The same logic may also apply to vDPA if it works in a similar
way.

The work above was still generic, but I mentioned the idea of optimizing
for 1G huge pages here:

https://lore.kernel.org/all/aC3z_gUxJbY1_JP7@x1.local/#t

Above is just FYI.. definitely not a request to work on that.  So if we
can better split the issue into multiple smaller scopes of work, it
would be nicer.  The "iterable migratable virtio-net" might just hide too
many things under the hood.

> 
> Enable dataplane: Time from right after first VQ is enabled to right after
> the last VQ is enabled.
>  - 0.081ms, 0.081ms, 0.079ms
> 

The other thing that might be worth mentioning: from a migration
perspective, VFIO introduced a feature called switchover-ack:

# @switchover-ack: If enabled, migration will not stop the source VM
#     and complete the migration until an ACK is received from the
#     destination that it's OK to do so.  Exactly when this ACK is
#     sent depends on the migrated devices that use this feature.  For
#     example, a device can use it to make sure some of its data is
#     sent and loaded in the destination before doing switchover.
#     This can reduce downtime if devices that support this capability
#     are present.  'return-path' capability must be enabled to use
#     it.  (since 8.1)

If the above 20+ seconds are not avoidable, I'm not sure whether virtio-net
would like to opt in to this feature too, so that switchover won't happen
too soon, during a premature preheat, and that time won't be counted as
downtime.

Again, just FYI. I'm not sure if it's applicable.

Thanks,

-- 
Peter Xu

Re: [RFC 5/6] virtio,virtio-net: skip consistency check in virtio_load for iterative migration

Posted by Eugenio Perez Martin 2 months, 4 weeks ago

On Mon, Aug 18, 2025 at 6:21 PM Peter Xu <peterx@redhat.com> wrote:
>
> On Mon, Aug 18, 2025 at 10:46:00AM -0400, Jonah Palmer wrote:
> >
> >
> > On 8/18/25 2:51 AM, Eugenio Perez Martin wrote:
> > > On Fri, Aug 15, 2025 at 4:50 PM Jonah Palmer <jonah.palmer@oracle.com> wrote:
> > > >
> > > >
> > > >
> > > > On 8/14/25 5:28 AM, Eugenio Perez Martin wrote:
> > > > > On Wed, Aug 13, 2025 at 4:06 PM Peter Xu <peterx@redhat.com> wrote:
> > > > > >
> > > > > > On Wed, Aug 13, 2025 at 11:25:00AM +0200, Eugenio Perez Martin wrote:
> > > > > > > On Mon, Aug 11, 2025 at 11:56 PM Peter Xu <peterx@redhat.com> wrote:
> > > > > > > >
> > > > > > > > On Mon, Aug 11, 2025 at 05:26:05PM -0400, Jonah Palmer wrote:
> > > > > > > > > This effort was started to reduce the guest visible downtime by
> > > > > > > > > virtio-net/vhost-net/vhost-vDPA during live migration, especially
> > > > > > > > > vhost-vDPA.
> > > > > > > > >
> > > > > > > > > The downtime contributed by vhost-vDPA, for example, is not from having to
> > > > > > > > > migrate a lot of state but rather expensive backend control-plane latency
> > > > > > > > > like CVQ configurations (e.g. MQ queue pairs, RSS, MAC/VLAN filters, offload
> > > > > > > > > settings, MTU, etc.). Doing this requires kernel/HW NIC operations which
> > > > > > > > > dominates its downtime.
> > > > > > > > >
> > > > > > > > > In other words, by migrating the state of virtio-net early (before the
> > > > > > > > > stop-and-copy phase), we can also start staging backend configurations,
> > > > > > > > > which is the main contributor of downtime when migrating a vhost-vDPA
> > > > > > > > > device.
> > > > > > > > >
> > > > > > > > > I apologize if this series gives the impression that we're migrating a lot
> > > > > > > > > of data here. It's more along the lines of moving control-plane latency out
> > > > > > > > > of the stop-and-copy phase.
> > > > > > > >
> > > > > > > > I see, thanks.
> > > > > > > >
> > > > > > > > Please add these into the cover letter of the next post.  IMHO it's
> > > > > > > > extremely important information to explain the real goal of this work.  I
> > > > > > > > bet it is not expected for most people when reading the current cover
> > > > > > > > letter.
> > > > > > > >
> > > > > > > > Then it could have nothing to do with iterative phase, am I right?
> > > > > > > >
> > > > > > > > What are the data needed for the dest QEMU to start staging backend
> > > > > > > > configurations to the HWs underneath?  Does dest QEMU already have them in
> > > > > > > > the cmdlines?
> > > > > > > >
> > > > > > > > Asking this because I want to know whether it can be done completely
> > > > > > > > without src QEMU at all, e.g. when dest QEMU starts.
> > > > > > > >
> > > > > > > > If src QEMU's data is still needed, please also first consider providing
> > > > > > > > such facility using an "early VMSD" if it is ever possible: feel free to
> > > > > > > > refer to commit 3b95a71b22827d26178.
> > > > > > > >
> > > > > > >
> > > > > > > While it works for this series, it does not allow to resend the state
> > > > > > > when the src device changes. For example, if the number of virtqueues
> > > > > > > is modified.
> > > > > >
> > > > > > Some explanation on "how sync number of vqueues helps downtime" would help.
> > > > > > Not "it might preheat things", but exactly why, and how that differs when
> > > > > > it's pure software, and when hardware will be involved.
> > > > > >
> > > > >
> > > > > By nvidia engineers to configure vqs (number, size, RSS, etc) takes
> > > > > about ~200ms:
> > > > > https://urldefense.com/v3/__https://lore.kernel.org/qemu-devel/6c8ebb97-d546-3f1c-4cdd-54e23a566f61@nvidia.com/T/__;!!ACWV5N9M2RV99hQ!OQdf7sGaBlbXhcFHX7AC7HgYxvFljgwWlIgJCvMgWwFvPqMrAMbWqf0862zV5shIjaUvlrk54fLTK6uo2pA$
> > > > >
> > > > > Adding Dragos here in case he can provide more details. Maybe the
> > > > > numbers have changed though.
> > > > >
> > > > > And I guess the difference with pure SW will always come down to PCI
> > > > > communications, which assume it is slower than configuring the host SW
> > > > > device in RAM or even CPU cache. But I admin that proper profiling is
> > > > > needed before making those claims.
> > > > >
> > > > > Jonah, can you print the time it takes to configure the vDPA device
> > > > > with traces vs the time it takes to enable the dataplane of the
> > > > > device? So we can get an idea of how much time we save with this.
> > > > >
> > > >
> > > > Let me know if this isn't what you're looking for.
> > > >
> > > > I'm assuming by "configuration time" you mean:
> > > >    - Time from device startup (entry to vhost_vdpa_dev_start()) to right
> > > >      before we start enabling the vrings (e.g.
> > > >      VHOST_VDPA_SET_VRING_ENABLE in vhost_vdpa_net_cvq_load()).
> > > >
> > > > And by "time taken to enable the dataplane" I'm assuming you mean:
> > > >    - Time right before we start enabling the vrings (see above) to right
> > > >      after we enable the last vring (at the end of
> > > >      vhost_vdpa_net_cvq_load())
> > > >
> > > > Guest specs: 128G Mem, SVQ=on, CVQ=on, 8 queue pairs:
> > > >
> > > > -netdev type=vhost-vdpa,vhostdev=$VHOST_VDPA_0,id=vhost-vdpa0,
> > > >           queues=8,x-svq=on
> > > >
> > > > -device virtio-net-pci,netdev=vhost-vdpa0,id=vdpa0,bootindex=-1,
> > > >           romfile=,page-per-vq=on,mac=$VF1_MAC,ctrl_vq=on,mq=on,
> > > >           ctrl_vlan=off,vectors=18,host_mtu=9000,
> > > >           disable-legacy=on,disable-modern=off
> > > >
> > > > ---
> > > >
> > > > Configuration time:    ~31s
> > > > Dataplane enable time: ~0.14ms
> > > >
> > >
> > > I was vague, but yes, that's representative enough! It would be more
> > > accurate if the configuration time ends by the time QEMU enables the
> > > first queue of the dataplane though.
> > >
> > > As Si-Wei mentions, is v->shared->listener_registered == true at the
> > > beginning of vhost_vdpa_dev_start?
> > >
> >
> > Ah, I also realized that Qemu I was using for measurements was using a
> > version before the listener_registered member was introduced.
> >
> > I retested with the latest changes in Qemu and set x-svq=off, e.g.: guest
> > specs: 128G Mem, SVQ=off, CVQ=on, 8 queue pairs. I ran testing 3 times for
> > measurements.
> >
> > v->shared->listener_registered == false at the beginning of
> > vhost_vdpa_dev_start().
> >
> > ---
> >
> > Configuration time: Time from first entry into vhost_vdpa_dev_start() to
> > right after Qemu enables the first VQ.
> >  - 26.947s, 26.606s, 27.326s
>
> It's surprising to know it takes 20+ seconds for one device to load.
>
> Sorry I'm not familiar with CVQ, please bare with me on my ignorance: how
> much CVQ=on contributes to this?  Is page pinning involved here?  Is 128GB
> using small pages only?
>

CVQ=on is only enabled so that we can enable multiqueue, as the HW device
configuration time seems to be ~linear with the number of queues.

> It looks to me there can still be many things that vDPA will face similar
> challenges that VFIO already had.  For example, there's current work
> optimizing pinning for VFIO here:
>
> https://lore.kernel.org/all/20250814064714.56485-1-lizhe.67@bytedance.com/
>
> For the long term, not sure if (for either VFIO or vDPA, or similar devices
> that needs guest pinning) it would make more sense to start using 1G huge
> pages just for the sake of fast pinning.
>
> PFNMAP in VFIO already works with 1G pfnmaps with commit eb996eec783c.
> Logically if we could use 1G pages (e.g. on x86_64) for guest, then pinning
> / unpinning can also be easily batched, and DMA pinning should be much
> faster.  The same logic may also apply to vDPA if it works the similar way.
>
> The work above was still generic, but I mentioned the idea of optimizing
> for 1G huge pages here:
>
> https://lore.kernel.org/all/aC3z_gUxJbY1_JP7@x1.local/#t
>
> Above is just FYI.. definitely not an request to work on that.  So if we
> can better split the issue into smaller but multiple scope of works it
> would be nicer.

I agree. QEMU master is already able to do the memory pinning before
the downtime, so let's profile that way.

> The "iterable migratable virtio-net" might just hide too
> many things under the hood.
>
> >
> > Enable dataplane: Time from right after first VQ is enabled to right after
> > the last VQ is enabled.
> >  - 0.081ms, 0.081ms, 0.079ms
> >
>
> The other thing that might worth mention.. from migration perspective, VFIO
> used to introduce one feature called switchover-ack:
>
> # @switchover-ack: If enabled, migration will not stop the source VM
> #     and complete the migration until an ACK is received from the
> #     destination that it's OK to do so.  Exactly when this ACK is
> #     sent depends on the migrated devices that use this feature.  For
> #     example, a device can use it to make sure some of its data is
> #     sent and loaded in the destination before doing switchover.
> #     This can reduce downtime if devices that support this capability
> #     are present.  'return-path' capability must be enabled to use
> #     it.  (since 8.1)
>
> If above 20+ seconds are not avoidable, not sure if virtio-net would like
> to opt-in in this feature too, so that switchover won't happen too soon
> during an pre-mature preheat, so that won't be accounted into downtime.
>
> Again, just FYI. I'm not sure if it's applicable.
>

Yes it is, my first versions used it :). As you said, maybe we need to
use it here, so it is worth keeping it in mind!

Re: [RFC 5/6] virtio,virtio-net: skip consistency check in virtio_load for iterative migration

Posted by Si-Wei Liu 3 months ago

Hi Jonah,

On 8/15/2025 7:50 AM, Jonah Palmer wrote:
>
>
> On 8/14/25 5:28 AM, Eugenio Perez Martin wrote:
>> On Wed, Aug 13, 2025 at 4:06 PM Peter Xu <peterx@redhat.com> wrote:
>>>
>>> On Wed, Aug 13, 2025 at 11:25:00AM +0200, Eugenio Perez Martin wrote:
>>>> On Mon, Aug 11, 2025 at 11:56 PM Peter Xu <peterx@redhat.com> wrote:
>>>>>
>>>>> On Mon, Aug 11, 2025 at 05:26:05PM -0400, Jonah Palmer wrote:
>>>>>> This effort was started to reduce the guest visible downtime by
>>>>>> virtio-net/vhost-net/vhost-vDPA during live migration, especially
>>>>>> vhost-vDPA.
>>>>>>
>>>>>> The downtime contributed by vhost-vDPA, for example, is not from 
>>>>>> having to
>>>>>> migrate a lot of state but rather expensive backend control-plane 
>>>>>> latency
>>>>>> like CVQ configurations (e.g. MQ queue pairs, RSS, MAC/VLAN 
>>>>>> filters, offload
>>>>>> settings, MTU, etc.). Doing this requires kernel/HW NIC 
>>>>>> operations which
>>>>>> dominates its downtime.
>>>>>>
>>>>>> In other words, by migrating the state of virtio-net early 
>>>>>> (before the
>>>>>> stop-and-copy phase), we can also start staging backend 
>>>>>> configurations,
>>>>>> which is the main contributor of downtime when migrating a 
>>>>>> vhost-vDPA
>>>>>> device.
>>>>>>
>>>>>> I apologize if this series gives the impression that we're 
>>>>>> migrating a lot
>>>>>> of data here. It's more along the lines of moving control-plane 
>>>>>> latency out
>>>>>> of the stop-and-copy phase.
>>>>>
>>>>> I see, thanks.
>>>>>
>>>>> Please add these into the cover letter of the next post. IMHO it's
>>>>> extremely important information to explain the real goal of this 
>>>>> work.  I
>>>>> bet it is not expected for most people when reading the current cover
>>>>> letter.
>>>>>
>>>>> Then it could have nothing to do with iterative phase, am I right?
>>>>>
>>>>> What are the data needed for the dest QEMU to start staging backend
>>>>> configurations to the HWs underneath?  Does dest QEMU already have 
>>>>> them in
>>>>> the cmdlines?
>>>>>
>>>>> Asking this because I want to know whether it can be done completely
>>>>> without src QEMU at all, e.g. when dest QEMU starts.
>>>>>
>>>>> If src QEMU's data is still needed, please also first consider 
>>>>> providing
>>>>> such facility using an "early VMSD" if it is ever possible: feel 
>>>>> free to
>>>>> refer to commit 3b95a71b22827d26178.
>>>>>
>>>>
>>>> While it works for this series, it does not allow to resend the state
>>>> when the src device changes. For example, if the number of virtqueues
>>>> is modified.
>>>
>>> Some explanation on "how sync number of vqueues helps downtime" 
>>> would help.
>>> Not "it might preheat things", but exactly why, and how that differs 
>>> when
>>> it's pure software, and when hardware will be involved.
>>>
>>
>> By nvidia engineers to configure vqs (number, size, RSS, etc) takes
>> about ~200ms:
>> https://urldefense.com/v3/__https://lore.kernel.org/qemu-devel/6c8ebb97-d546-3f1c-4cdd-54e23a566f61@nvidia.com/T/__;!!ACWV5N9M2RV99hQ!OQdf7sGaBlbXhcFHX7AC7HgYxvFljgwWlIgJCvMgWwFvPqMrAMbWqf0862zV5shIjaUvlrk54fLTK6uo2pA$ 
>>
>>
>> Adding Dragos here in case he can provide more details. Maybe the
>> numbers have changed though.
>>
>> And I guess the difference with pure SW will always come down to PCI
>> communications, which assume it is slower than configuring the host SW
>> device in RAM or even CPU cache. But I admin that proper profiling is
>> needed before making those claims.
>>
>> Jonah, can you print the time it takes to configure the vDPA device
>> with traces vs the time it takes to enable the dataplane of the
>> device? So we can get an idea of how much time we save with this.
>>
>
> Let me know if this isn't what you're looking for.
>
> I'm assuming by "configuration time" you mean:
>  - Time from device startup (entry to vhost_vdpa_dev_start()) to right
>    before we start enabling the vrings (e.g.
>    VHOST_VDPA_SET_VRING_ENABLE in vhost_vdpa_net_cvq_load()).
>
> And by "time taken to enable the dataplane" I'm assuming you mean:
>  - Time right before we start enabling the vrings (see above) to right
>    after we enable the last vring (at the end of
>    vhost_vdpa_net_cvq_load())
>
> Guest specs: 128G Mem, SVQ=on, CVQ=on, 8 queue pairs:
I guess what Eugenio may want to see is the config with SVQ=off (i.e.
without x-svq=on in the netdev line below). Do you have numbers for that as
well? Since the time measured from vhost_vdpa_dev_start() should then
exclude the time for pinning, you could easily profile/measure the vq
configuration time (the CVQ commands to configure vq number, size, RSS,
etc) vs dataplane enablement, the same way as you did for SVQ=on.

Regards,
-Siwei

>
> -netdev type=vhost-vdpa,vhostdev=$VHOST_VDPA_0,id=vhost-vdpa0,
>         queues=8,x-svq=on
>
> -device virtio-net-pci,netdev=vhost-vdpa0,id=vdpa0,bootindex=-1,
>         romfile=,page-per-vq=on,mac=$VF1_MAC,ctrl_vq=on,mq=on,
>         ctrl_vlan=off,vectors=18,host_mtu=9000,
>         disable-legacy=on,disable-modern=off
>
> ---
>
> Configuration time:    ~31s
> Dataplane enable time: ~0.14ms
>
>>> If it's only about pre-heat, could dest qemu preheat with max num of
>>> vqueues?  Is it the same cost of downtime when growing num of queues,
>>> v.s. shrinking num of queues?
>>>
>>
>> Well you need to send the vq addresses and properties to preheat
>> these. If the address is invalid, the destination device will
>> interpret the vq address as the avail ring, for example, and will read
>> an invalid avail idx.
>>
>>> For softwares, is it about memory transaction updates due to the 
>>> vqueues?
>>> If so, have we investigated a more generic approach on memory side, 
>>> likely
>>> some form of continuation from Chuang's work I previously mentioned?
>>>
>>
>> This work is very interesting, and most of the downtime was because of
>> memory pinning indeed. Thanks for bringing it up! But the downtime is
>> not caused for the individual vq memory config, but for pinning all
>> the guest's memory for the device to access to it.
>>
>> I think it is worth exploring if it affects the downtime in the case
>> of HW. I don't see any reason to reject that series but lack of
>> reviews, isn't it?
>>
>

Re: [RFC 5/6] virtio,virtio-net: skip consistency check in virtio_load for iterative migration

Posted by Peter Xu 3 months ago

On Thu, Aug 14, 2025 at 11:28:24AM +0200, Eugenio Perez Martin wrote:
> Well you need to send the vq addresses and properties to preheat
> these. If the address is invalid, the destination device will
> interpret the vq address as the avail ring, for example, and will read
> an invalid avail idx.

I see now.  But.. aren't vq addresses assigned by the guest driver?  What
happens if one pre-heated the vqs but the VM rebooted right before live
migration decides to switch over to dest QEMU?

> 
> > For softwares, is it about memory transaction updates due to the vqueues?
> > If so, have we investigated a more generic approach on memory side, likely
> > some form of continuation from Chuang's work I previously mentioned?
> >
> 
> This work is very interesting, and most of the downtime was because of
> memory pinning indeed. Thanks for bringing it up! But the downtime is
> not caused for the individual vq memory config, but for pinning all
> the guest's memory for the device to access to it.
> 
> I think it is worth exploring if it affects the downtime in the case
> of HW. I don't see any reason to reject that series but lack of
> reviews, isn't it?

Partly yes.. but not fully.

I don't remember many details, but I do remember the series tried to mark
the whole device load as one memory transaction, which will cause the
guest GPA flatview to be obsolete during that period.

The issue should be that some of the special devices will need to access
guest memory during post_load(), hence one transaction wouldn't be enough,
and I don't remember whether we have captured all such outliers, or any
side effects due to the presence of a possibly obsolete flatview.

In one of the later discussions, Stefan mentioned we could provide a
smaller transaction window, and I think that might be something we can
also try.

For example, I think it's worthwhile to try one transaction per virtio-net
device, then all the vqueues will be loaded in one transaction as long as
the load of the virtio-net device doesn't need to access guest memory.

-- 
Peter Xu

Re: [RFC 5/6] virtio,virtio-net: skip consistency check in virtio_load for iterative migration

Posted by Dragos Tatulea 3 months ago

On Thu, Aug 14, 2025 at 11:28:24AM +0200, Eugenio Perez Martin wrote:
> On Wed, Aug 13, 2025 at 4:06 PM Peter Xu <peterx@redhat.com> wrote:
> >
> > On Wed, Aug 13, 2025 at 11:25:00AM +0200, Eugenio Perez Martin wrote:
> > > On Mon, Aug 11, 2025 at 11:56 PM Peter Xu <peterx@redhat.com> wrote:
> > > >
> > > > On Mon, Aug 11, 2025 at 05:26:05PM -0400, Jonah Palmer wrote:
> > > > > This effort was started to reduce the guest visible downtime by
> > > > > virtio-net/vhost-net/vhost-vDPA during live migration, especially
> > > > > vhost-vDPA.
> > > > >
> > > > > The downtime contributed by vhost-vDPA, for example, is not from having to
> > > > > migrate a lot of state but rather expensive backend control-plane latency
> > > > > like CVQ configurations (e.g. MQ queue pairs, RSS, MAC/VLAN filters, offload
> > > > > settings, MTU, etc.). Doing this requires kernel/HW NIC operations which
> > > > > dominates its downtime.
> > > > >
> > > > > In other words, by migrating the state of virtio-net early (before the
> > > > > stop-and-copy phase), we can also start staging backend configurations,
> > > > > which is the main contributor of downtime when migrating a vhost-vDPA
> > > > > device.
> > > > >
> > > > > I apologize if this series gives the impression that we're migrating a lot
> > > > > of data here. It's more along the lines of moving control-plane latency out
> > > > > of the stop-and-copy phase.
> > > >
> > > > I see, thanks.
> > > >
> > > > Please add these into the cover letter of the next post.  IMHO it's
> > > > extremely important information to explain the real goal of this work.  I
> > > > bet it is not expected for most people when reading the current cover
> > > > letter.
> > > >
> > > > Then it could have nothing to do with iterative phase, am I right?
> > > >
> > > > What are the data needed for the dest QEMU to start staging backend
> > > > configurations to the HWs underneath?  Does dest QEMU already have them in
> > > > the cmdlines?
> > > >
> > > > Asking this because I want to know whether it can be done completely
> > > > without src QEMU at all, e.g. when dest QEMU starts.
> > > >
> > > > If src QEMU's data is still needed, please also first consider providing
> > > > such facility using an "early VMSD" if it is ever possible: feel free to
> > > > refer to commit 3b95a71b22827d26178.
> > > >
> > >
> > > While it works for this series, it does not allow to resend the state
> > > when the src device changes. For example, if the number of virtqueues
> > > is modified.
> >
> > Some explanation on "how sync number of vqueues helps downtime" would help.
> > Not "it might preheat things", but exactly why, and how that differs when
> > it's pure software, and when hardware will be involved.
> >
> 
> By nvidia engineers to configure vqs (number, size, RSS, etc) takes
> about ~200ms:
> https://lore.kernel.org/qemu-devel/6c8ebb97-d546-3f1c-4cdd-54e23a566f61@nvidia.com/T/
> 
> Adding Dragos here in case he can provide more details. Maybe the
> numbers have changed though.
For kernel mlx5_vdpa it can be even more on larger systems (256 GB VM
with 32 VQs):
https://lore.kernel.org/virtualization/20240830105838.2666587-2-dtatulea@nvidia.com/

As pointed out in the above link, configuring VQs can amount to a lot of
time when many VQs are used (32 in our example). So having them
pre-configured during migration would be a worthwhile optimization.

Thanks,
Dragos

Re: [RFC 5/6] virtio,virtio-net: skip consistency check in virtio_load for iterative migration

Posted by Jonah Palmer 3 months ago


On 8/11/25 5:55 PM, Peter Xu wrote:
> On Mon, Aug 11, 2025 at 05:26:05PM -0400, Jonah Palmer wrote:
>> This effort was started to reduce the guest visible downtime by
>> virtio-net/vhost-net/vhost-vDPA during live migration, especially
>> vhost-vDPA.
>>
>> The downtime contributed by vhost-vDPA, for example, is not from having to
>> migrate a lot of state but rather expensive backend control-plane latency
>> like CVQ configurations (e.g. MQ queue pairs, RSS, MAC/VLAN filters, offload
>> settings, MTU, etc.). Doing this requires kernel/HW NIC operations which
>> dominates its downtime.
>>
>> In other words, by migrating the state of virtio-net early (before the
>> stop-and-copy phase), we can also start staging backend configurations,
>> which is the main contributor of downtime when migrating a vhost-vDPA
>> device.
>>
>> I apologize if this series gives the impression that we're migrating a lot
>> of data here. It's more along the lines of moving control-plane latency out
>> of the stop-and-copy phase.
> 
> I see, thanks.
> 
> Please add these into the cover letter of the next post.  IMHO it's
> extremely important information to explain the real goal of this work.  I
> bet it is not expected for most people when reading the current cover
> letter.
> 
> Then it could have nothing to do with iterative phase, am I right?
> 
> What are the data needed for the dest QEMU to start staging backend
> configurations to the HWs underneath?  Does dest QEMU already have them in
> the cmdlines?
> 
> Asking this because I want to know whether it can be done completely
> without src QEMU at all, e.g. when dest QEMU starts.
> 
> If src QEMU's data is still needed, please also first consider providing
> such facility using an "early VMSD" if it is ever possible: feel free to
> refer to commit 3b95a71b22827d26178.
> 
> So the data to be transferred is still in VMSD form, aka, data are still
> described by VMSD macros, instead of hard-coded streamline protocols using
> e.g. qemufile APIs using save_setup()/load_setup().
> 
> When things are described in VMSDs, it get the most benefit from the live
> migration framework, and it's much, much more flexible.  It's the most
> suggested way for device to cooperate with live migration, savevmhandlers
> are only the last resort because it's almost not in control of migration..
> 
> In short, please avoid using savevmhandlers as long as there can be any
> other way to achieve similar results.
> 

Oh this early VMSD is interesting and, at first glance, appears to be 
suitable for what we're trying to do here. I'll take a look at it and 
see if this is something we can use instead of the SaveVMHandlers hooks.

Thank you for mentioning this.

Jonah

> Thanks,
>

Re: [RFC 5/6] virtio,virtio-net: skip consistency check in virtio_load for iterative migration

Posted by Eugenio Perez Martin 3 months, 2 weeks ago

On Tue, Jul 22, 2025 at 2:41 PM Jonah Palmer <jonah.palmer@oracle.com> wrote:
>
> Iterative live migration for virtio-net sends an initial
> VMStateDescription while the source is still active. Because data
> continues to flow for virtio-net, the guest's avail index continues to
> increment after last_avail_idx had already been sent. This causes the
> destination to often see something like this from virtio_error():
>
> VQ 0 size 0x100 Guest index 0x0 inconsistent with Host index 0xc: delta 0xfff4
>
> This patch suppresses this consistency check if we're loading the
> initial VMStateDescriptions via iterative migration and unsuppresses
> it for the stop-and-copy phase when the final VMStateDescriptions
> (carrying the correct indices) are loaded.
>
> A temporary VirtIODevMigration migration data structure is introduced here to
> represent the iterative migration process for a VirtIODevice. For now it
> just holds a flag to indicate whether or not the initial
> VMStateDescription was sent during the iterative live migration process.
>
> Signed-off-by: Jonah Palmer <jonah.palmer@oracle.com>
> ---
>  hw/net/virtio-net.c        | 13 +++++++++++++
>  hw/virtio/virtio.c         | 32 ++++++++++++++++++++++++--------
>  include/hw/virtio/virtio.h |  6 ++++++
>  3 files changed, 43 insertions(+), 8 deletions(-)
>
> diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
> index 86a6fe5b91..b7ac5e8278 100644
> --- a/hw/net/virtio-net.c
> +++ b/hw/net/virtio-net.c
> @@ -3843,12 +3843,19 @@ static void virtio_net_save_cleanup(void *opaque)
>
>  static int virtio_net_load_setup(QEMUFile *f, void *opaque, Error **errp)
>  {
> +    VirtIONet *n = opaque;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(n);
> +    vdev->migration = g_new0(VirtIODevMigration, 1);
> +    vdev->migration->iterative_vmstate_loaded = false;
> +
>      return 0;
>  }
>
>  static int virtio_net_load_state(QEMUFile *f, void *opaque, int version_id)
>  {
>      VirtIONet *n = opaque;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(n);
> +    VirtIODevMigration *mig = vdev->migration;
>      uint64_t flag;
>
>      flag = qemu_get_be64(f);
> @@ -3861,6 +3868,7 @@ static int virtio_net_load_state(QEMUFile *f, void *opaque, int version_id)
>          case VNET_MIG_F_INIT_STATE:
>          {
>              vmstate_load_state(f, &vmstate_virtio_net, n, VIRTIO_NET_VM_VERSION);
> +            mig->iterative_vmstate_loaded = true;

This code will need to change if we send the status iteratively more
than once. For example, if the guest changes the mac address, the
number of vqs, etc.

In my opinion, we should set a flag named "in_iterative_migration" (or
equivalent) in virtio_net_load_setup and clear it in
virtio_net_load_cleanup. That's enough to tell in virtio_load if we
should perform actions like checking for inconsistent indices.

>              break;
>          }
>          default:
> @@ -3875,6 +3883,11 @@ static int virtio_net_load_state(QEMUFile *f, void *opaque, int version_id)
>
>  static int virtio_net_load_cleanup(void *opaque)
>  {
> +    VirtIONet *n = opaque;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(n);
> +    g_free(vdev->migration);
> +    vdev->migration = NULL;
> +
>      return 0;
>  }
>
> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> index 5534251e01..68957ee7d1 100644
> --- a/hw/virtio/virtio.c
> +++ b/hw/virtio/virtio.c
> @@ -3222,6 +3222,7 @@ virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id)
>      int32_t config_len;
>      uint32_t num;
>      uint32_t features;
> +    bool inconsistent_indices;
>      BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
>      VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
>      VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev);
> @@ -3365,6 +3366,16 @@ virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id)
>          if (vdev->vq[i].vring.desc) {
>              uint16_t nheads;
>
> +           /*
> +            * Ring indices will be inconsistent during iterative migration. The actual
> +            * indices will be sent later during the stop-and-copy phase.
> +            */
> +            if (vdev->migration) {
> +                inconsistent_indices = !vdev->migration->iterative_vmstate_loaded;
> +            } else {
> +                inconsistent_indices = false;
> +            }

Nit, "inconsistent_indices = vdev->migration &&
!vdev->migration->iterative_vmstate_loaded" ? I'm happy with the
current "if else" too, but I think the one line is clearer. Your call
:).

> +
>              /*
>               * VIRTIO-1 devices migrate desc, used, and avail ring addresses so
>               * only the region cache needs to be set up.  Legacy devices need
> @@ -3384,14 +3395,19 @@ virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id)
>                  continue;
>              }
>
> -            nheads = vring_avail_idx(&vdev->vq[i]) - vdev->vq[i].last_avail_idx;
> -            /* Check it isn't doing strange things with descriptor numbers. */
> -            if (nheads > vdev->vq[i].vring.num) {
> -                virtio_error(vdev, "VQ %d size 0x%x Guest index 0x%x "
> -                             "inconsistent with Host index 0x%x: delta 0x%x",
> -                             i, vdev->vq[i].vring.num,
> -                             vring_avail_idx(&vdev->vq[i]),
> -                             vdev->vq[i].last_avail_idx, nheads);
> +            if (!inconsistent_indices) {
> +                nheads = vring_avail_idx(&vdev->vq[i]) - vdev->vq[i].last_avail_idx;
> +                /* Check it isn't doing strange things with descriptor numbers. */
> +                if (nheads > vdev->vq[i].vring.num) {
> +                    virtio_error(vdev, "VQ %d size 0x%x Guest index 0x%x "
> +                                 "inconsistent with Host index 0x%x: delta 0x%x",
> +                                 i, vdev->vq[i].vring.num,
> +                                 vring_avail_idx(&vdev->vq[i]),
> +                                 vdev->vq[i].last_avail_idx, nheads);
> +                    inconsistent_indices = true;
> +                }
> +            }
> +            if (inconsistent_indices) {
>                  vdev->vq[i].used_idx = 0;
>                  vdev->vq[i].shadow_avail_idx = 0;
>                  vdev->vq[i].inuse = 0;
> diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
> index 214d4a77e9..06b6e6ba65 100644
> --- a/include/hw/virtio/virtio.h
> +++ b/include/hw/virtio/virtio.h
> @@ -98,6 +98,11 @@ enum virtio_device_endian {
>      VIRTIO_DEVICE_ENDIAN_BIG,
>  };
>
> +/* VirtIODevice iterative live migration data structure */
> +typedef struct VirtIODevMigration {
> +    bool iterative_vmstate_loaded;
> +} VirtIODevMigration;
> +
>  /**
>   * struct VirtIODevice - common VirtIO structure
>   * @name: name of the device
> @@ -151,6 +156,7 @@ struct VirtIODevice
>      bool disable_legacy_check;
>      bool vhost_started;
>      VMChangeStateEntry *vmstate;
> +    VirtIODevMigration *migration;
>      char *bus_name;
>      uint8_t device_endian;
>      /**
> --
> 2.47.1
>

Re: [RFC 5/6] virtio,virtio-net: skip consistency check in virtio_load for iterative migration

Posted by Jonah Palmer 3 months, 2 weeks ago


On 7/28/25 11:30 AM, Eugenio Perez Martin wrote:
> On Tue, Jul 22, 2025 at 2:41 PM Jonah Palmer <jonah.palmer@oracle.com> wrote:
>>
>> Iterative live migration for virtio-net sends an initial
>> VMStateDescription while the source is still active. Because data
>> continues to flow for virtio-net, the guest's avail index continues to
>> increment after last_avail_idx had already been sent. This causes the
>> destination to often see something like this from virtio_error():
>>
>> VQ 0 size 0x100 Guest index 0x0 inconsistent with Host index 0xc: delta 0xfff4
>>
>> This patch suppresses this consistency check if we're loading the
>> initial VMStateDescriptions via iterative migration and unsuppresses
>> it for the stop-and-copy phase when the final VMStateDescriptions
>> (carrying the correct indices) are loaded.
>>
>> A temporary VirtIODevMigration migration data structure is introduced here to
>> represent the iterative migration process for a VirtIODevice. For now it
>> just holds a flag to indicate whether or not the initial
>> VMStateDescription was sent during the iterative live migration process.
>>
>> Signed-off-by: Jonah Palmer <jonah.palmer@oracle.com>
>> ---
>>   hw/net/virtio-net.c        | 13 +++++++++++++
>>   hw/virtio/virtio.c         | 32 ++++++++++++++++++++++++--------
>>   include/hw/virtio/virtio.h |  6 ++++++
>>   3 files changed, 43 insertions(+), 8 deletions(-)
>>
>> diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
>> index 86a6fe5b91..b7ac5e8278 100644
>> --- a/hw/net/virtio-net.c
>> +++ b/hw/net/virtio-net.c
>> @@ -3843,12 +3843,19 @@ static void virtio_net_save_cleanup(void *opaque)
>>
>>   static int virtio_net_load_setup(QEMUFile *f, void *opaque, Error **errp)
>>   {
>> +    VirtIONet *n = opaque;
>> +    VirtIODevice *vdev = VIRTIO_DEVICE(n);
>> +    vdev->migration = g_new0(VirtIODevMigration, 1);
>> +    vdev->migration->iterative_vmstate_loaded = false;
>> +
>>       return 0;
>>   }
>>
>>   static int virtio_net_load_state(QEMUFile *f, void *opaque, int version_id)
>>   {
>>       VirtIONet *n = opaque;
>> +    VirtIODevice *vdev = VIRTIO_DEVICE(n);
>> +    VirtIODevMigration *mig = vdev->migration;
>>       uint64_t flag;
>>
>>       flag = qemu_get_be64(f);
>> @@ -3861,6 +3868,7 @@ static int virtio_net_load_state(QEMUFile *f, void *opaque, int version_id)
>>           case VNET_MIG_F_INIT_STATE:
>>           {
>>               vmstate_load_state(f, &vmstate_virtio_net, n, VIRTIO_NET_VM_VERSION);
>> +            mig->iterative_vmstate_loaded = true;
> 
> This code will need to change if we send the status iteratively more
> than once. For example, if the guest changes the mac address, the
> number of vqs, etc.
> 

Hopefully we can reach a solution where we'd only need to call the full 
vmstate_load_state(f, &vmstate_virtio_net, ...) for a virtio-net device 
once and then handle any changes afterwards individually.

Perhaps, maybe for simplicity, we could just send the 
sub-states/subsections (instead of the whole state again) iteratively if 
there were any changes in the fields that those sub-states/subsections 
govern.

Definitely something I'll keep in mind as this series develops.

> In my opinion, we should set a flag named "in_iterative_migration" (or
> equivalent) in virtio_net_load_setup and clear it in
> virtio_net_load_cleanup. That's enough to tell in virtio_load if we
> should perform actions like checking for inconsistent indices.
> 

I did actually try something like this but I realized that the 
.load_cleanup and .save_cleanup hooks actually fire at the very end of 
live migration (e.g. during the stop-and-copy phase). I thought they 
fired at the end of the iterative portion of live migration, but this 
didn't appear to be the case.

>>               break;
>>           }
>>           default:
>> @@ -3875,6 +3883,11 @@ static int virtio_net_load_state(QEMUFile *f, void *opaque, int version_id)
>>
>>   static int virtio_net_load_cleanup(void *opaque)
>>   {
>> +    VirtIONet *n = opaque;
>> +    VirtIODevice *vdev = VIRTIO_DEVICE(n);
>> +    g_free(vdev->migration);
>> +    vdev->migration = NULL;
>> +
>>       return 0;
>>   }
>>
>> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
>> index 5534251e01..68957ee7d1 100644
>> --- a/hw/virtio/virtio.c
>> +++ b/hw/virtio/virtio.c
>> @@ -3222,6 +3222,7 @@ virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id)
>>       int32_t config_len;
>>       uint32_t num;
>>       uint32_t features;
>> +    bool inconsistent_indices;
>>       BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
>>       VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
>>       VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev);
>> @@ -3365,6 +3366,16 @@ virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id)
>>           if (vdev->vq[i].vring.desc) {
>>               uint16_t nheads;
>>
>> +           /*
>> +            * Ring indices will be inconsistent during iterative migration. The actual
>> +            * indices will be sent later during the stop-and-copy phase.
>> +            */
>> +            if (vdev->migration) {
>> +                inconsistent_indices = !vdev->migration->iterative_vmstate_loaded;
>> +            } else {
>> +                inconsistent_indices = false;
>> +            }
> 
> Nit, "inconsistent_indices = vdev->migration &&
> !vdev->migration->iterative_vmstate_loaded" ? I'm happy with the
> current "if else" too, but I think the one line is clearer. Your call
> :).
> 

Ah, nice catch! I like the one-liner more :) Will change this for next 
series.

>> +
>>               /*
>>                * VIRTIO-1 devices migrate desc, used, and avail ring addresses so
>>                * only the region cache needs to be set up.  Legacy devices need
>> @@ -3384,14 +3395,19 @@ virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id)
>>                   continue;
>>               }
>>
>> -            nheads = vring_avail_idx(&vdev->vq[i]) - vdev->vq[i].last_avail_idx;
>> -            /* Check it isn't doing strange things with descriptor numbers. */
>> -            if (nheads > vdev->vq[i].vring.num) {
>> -                virtio_error(vdev, "VQ %d size 0x%x Guest index 0x%x "
>> -                             "inconsistent with Host index 0x%x: delta 0x%x",
>> -                             i, vdev->vq[i].vring.num,
>> -                             vring_avail_idx(&vdev->vq[i]),
>> -                             vdev->vq[i].last_avail_idx, nheads);
>> +            if (!inconsistent_indices) {
>> +                nheads = vring_avail_idx(&vdev->vq[i]) - vdev->vq[i].last_avail_idx;
>> +                /* Check it isn't doing strange things with descriptor numbers. */
>> +                if (nheads > vdev->vq[i].vring.num) {
>> +                    virtio_error(vdev, "VQ %d size 0x%x Guest index 0x%x "
>> +                                 "inconsistent with Host index 0x%x: delta 0x%x",
>> +                                 i, vdev->vq[i].vring.num,
>> +                                 vring_avail_idx(&vdev->vq[i]),
>> +                                 vdev->vq[i].last_avail_idx, nheads);
>> +                    inconsistent_indices = true;
>> +                }
>> +            }
>> +            if (inconsistent_indices) {
>>                   vdev->vq[i].used_idx = 0;
>>                   vdev->vq[i].shadow_avail_idx = 0;
>>                   vdev->vq[i].inuse = 0;
>> diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
>> index 214d4a77e9..06b6e6ba65 100644
>> --- a/include/hw/virtio/virtio.h
>> +++ b/include/hw/virtio/virtio.h
>> @@ -98,6 +98,11 @@ enum virtio_device_endian {
>>       VIRTIO_DEVICE_ENDIAN_BIG,
>>   };
>>
>> +/* VirtIODevice iterative live migration data structure */
>> +typedef struct VirtIODevMigration {
>> +    bool iterative_vmstate_loaded;
>> +} VirtIODevMigration;
>> +
>>   /**
>>    * struct VirtIODevice - common VirtIO structure
>>    * @name: name of the device
>> @@ -151,6 +156,7 @@ struct VirtIODevice
>>       bool disable_legacy_check;
>>       bool vhost_started;
>>       VMChangeStateEntry *vmstate;
>> +    VirtIODevMigration *migration;
>>       char *bus_name;
>>       uint8_t device_endian;
>>       /**
>> --
>> 2.47.1
>>
>

Re: [RFC 5/6] virtio,virtio-net: skip consistency check in virtio_load for iterative migration

Posted by Eugenio Perez Martin 3 months, 2 weeks ago

On Mon, Jul 28, 2025 at 6:24 PM Jonah Palmer <jonah.palmer@oracle.com> wrote:
>
>
>
> On 7/28/25 11:30 AM, Eugenio Perez Martin wrote:
> > On Tue, Jul 22, 2025 at 2:41 PM Jonah Palmer <jonah.palmer@oracle.com> wrote:
> >>
> >> Iterative live migration for virtio-net sends an initial
> >> VMStateDescription while the source is still active. Because data
> >> continues to flow for virtio-net, the guest's avail index continues to
> >> increment after last_avail_idx had already been sent. This causes the
> >> destination to often see something like this from virtio_error():
> >>
> >> VQ 0 size 0x100 Guest index 0x0 inconsistent with Host index 0xc: delta 0xfff4
> >>
> >> This patch suppresses this consistency check if we're loading the
> >> initial VMStateDescriptions via iterative migration and unsuppresses
> >> it for the stop-and-copy phase when the final VMStateDescriptions
> >> (carrying the correct indices) are loaded.
> >>
> >> A temporary VirtIODevMigration migration data structure is introduced here to
> >> represent the iterative migration process for a VirtIODevice. For now it
> >> just holds a flag to indicate whether or not the initial
> >> VMStateDescription was sent during the iterative live migration process.
> >>
> >> Signed-off-by: Jonah Palmer <jonah.palmer@oracle.com>
> >> ---
> >>   hw/net/virtio-net.c        | 13 +++++++++++++
> >>   hw/virtio/virtio.c         | 32 ++++++++++++++++++++++++--------
> >>   include/hw/virtio/virtio.h |  6 ++++++
> >>   3 files changed, 43 insertions(+), 8 deletions(-)
> >>
> >> diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
> >> index 86a6fe5b91..b7ac5e8278 100644
> >> --- a/hw/net/virtio-net.c
> >> +++ b/hw/net/virtio-net.c
> >> @@ -3843,12 +3843,19 @@ static void virtio_net_save_cleanup(void *opaque)
> >>
> >>   static int virtio_net_load_setup(QEMUFile *f, void *opaque, Error **errp)
> >>   {
> >> +    VirtIONet *n = opaque;
> >> +    VirtIODevice *vdev = VIRTIO_DEVICE(n);
> >> +    vdev->migration = g_new0(VirtIODevMigration, 1);
> >> +    vdev->migration->iterative_vmstate_loaded = false;
> >> +
> >>       return 0;
> >>   }
> >>
> >>   static int virtio_net_load_state(QEMUFile *f, void *opaque, int version_id)
> >>   {
> >>       VirtIONet *n = opaque;
> >> +    VirtIODevice *vdev = VIRTIO_DEVICE(n);
> >> +    VirtIODevMigration *mig = vdev->migration;
> >>       uint64_t flag;
> >>
> >>       flag = qemu_get_be64(f);
> >> @@ -3861,6 +3868,7 @@ static int virtio_net_load_state(QEMUFile *f, void *opaque, int version_id)
> >>           case VNET_MIG_F_INIT_STATE:
> >>           {
> >>               vmstate_load_state(f, &vmstate_virtio_net, n, VIRTIO_NET_VM_VERSION);
> >> +            mig->iterative_vmstate_loaded = true;
> >
> > This code will need to change if we send the status iteratively more
> > than once. For example, if the guest changes the mac address, the
> > number of vqs, etc.
> >
>
> Hopefully we can reach a solution where we'd only need to call the full
> vmstate_load_state(f, &vmstate_virtio_net, ...) for a virtio-net device
> once and then handle any changes afterwards individually.
>
> Perhaps, maybe for simplicity, we could just send the
> sub-states/subsections (instead of the whole state again) iteratively if
> there were any changes in the fields that those sub-states/subsections
> govern.
>
> Definitely something I'll keep in mind as this series develops.
>
> > In my opinion, we should set a flag named "in_iterative_migration" (or
> > equivalent) in virtio_net_load_setup and clear it in
> > virtio_net_load_cleanup. That's enough to tell in virtio_load if we
> > should perform actions like checking for inconsistent indices.
> >
>
> I did actually try something like this but I realized that the
> .load_cleanup and .save_cleanup hooks actually fire at the very end of
> live migration (e.g. during the stop-and-copy phase). I thought they
> fired at the end of the iterative portion of live migration, but this
> didn't appear to be the case.
>

Ok, that makes a lot of sense. What about .switchover_start? We would
need the switchover capability though, and I'm not sure if it is a good
idea to mandate it as a requirement. So yes, maybe this patch is the most
reliable way to do so.


> >>               break;
> >>           }
> >>           default:
> >> @@ -3875,6 +3883,11 @@ static int virtio_net_load_state(QEMUFile *f, void *opaque, int version_id)
> >>
> >>   static int virtio_net_load_cleanup(void *opaque)
> >>   {
> >> +    VirtIONet *n = opaque;
> >> +    VirtIODevice *vdev = VIRTIO_DEVICE(n);
> >> +    g_free(vdev->migration);
> >> +    vdev->migration = NULL;
> >> +
> >>       return 0;
> >>   }
> >>
> >> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> >> index 5534251e01..68957ee7d1 100644
> >> --- a/hw/virtio/virtio.c
> >> +++ b/hw/virtio/virtio.c
> >> @@ -3222,6 +3222,7 @@ virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id)
> >>       int32_t config_len;
> >>       uint32_t num;
> >>       uint32_t features;
> >> +    bool inconsistent_indices;
> >>       BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
> >>       VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
> >>       VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev);
> >> @@ -3365,6 +3366,16 @@ virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id)
> >>           if (vdev->vq[i].vring.desc) {
> >>               uint16_t nheads;
> >>
> >> +           /*
> >> +            * Ring indices will be inconsistent during iterative migration. The actual
> >> +            * indices will be sent later during the stop-and-copy phase.
> >> +            */
> >> +            if (vdev->migration) {
> >> +                inconsistent_indices = !vdev->migration->iterative_vmstate_loaded;
> >> +            } else {
> >> +                inconsistent_indices = false;
> >> +            }
> >
> > Nit, "inconsistent_indices = vdev->migration &&
> > !vdev->migration->iterative_vmstate_loaded" ? I'm happy with the
> > current "if else" too, but I think the one line is clearer. Your call
> > :).
> >
>
> Ah, nice catch! I like the one-liner more :) Will change this for next
> series.
>
> >> +
> >>               /*
> >>                * VIRTIO-1 devices migrate desc, used, and avail ring addresses so
> >>                * only the region cache needs to be set up.  Legacy devices need
> >> @@ -3384,14 +3395,19 @@ virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id)
> >>                   continue;
> >>               }
> >>
> >> -            nheads = vring_avail_idx(&vdev->vq[i]) - vdev->vq[i].last_avail_idx;
> >> -            /* Check it isn't doing strange things with descriptor numbers. */
> >> -            if (nheads > vdev->vq[i].vring.num) {
> >> -                virtio_error(vdev, "VQ %d size 0x%x Guest index 0x%x "
> >> -                             "inconsistent with Host index 0x%x: delta 0x%x",
> >> -                             i, vdev->vq[i].vring.num,
> >> -                             vring_avail_idx(&vdev->vq[i]),
> >> -                             vdev->vq[i].last_avail_idx, nheads);
> >> +            if (!inconsistent_indices) {
> >> +                nheads = vring_avail_idx(&vdev->vq[i]) - vdev->vq[i].last_avail_idx;
> >> +                /* Check it isn't doing strange things with descriptor numbers. */
> >> +                if (nheads > vdev->vq[i].vring.num) {
> >> +                    virtio_error(vdev, "VQ %d size 0x%x Guest index 0x%x "
> >> +                                 "inconsistent with Host index 0x%x: delta 0x%x",
> >> +                                 i, vdev->vq[i].vring.num,
> >> +                                 vring_avail_idx(&vdev->vq[i]),
> >> +                                 vdev->vq[i].last_avail_idx, nheads);
> >> +                    inconsistent_indices = true;
> >> +                }
> >> +            }
> >> +            if (inconsistent_indices) {
> >>                   vdev->vq[i].used_idx = 0;
> >>                   vdev->vq[i].shadow_avail_idx = 0;
> >>                   vdev->vq[i].inuse = 0;
> >> diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
> >> index 214d4a77e9..06b6e6ba65 100644
> >> --- a/include/hw/virtio/virtio.h
> >> +++ b/include/hw/virtio/virtio.h
> >> @@ -98,6 +98,11 @@ enum virtio_device_endian {
> >>       VIRTIO_DEVICE_ENDIAN_BIG,
> >>   };
> >>
> >> +/* VirtIODevice iterative live migration data structure */
> >> +typedef struct VirtIODevMigration {
> >> +    bool iterative_vmstate_loaded;
> >> +} VirtIODevMigration;
> >> +
> >>   /**
> >>    * struct VirtIODevice - common VirtIO structure
> >>    * @name: name of the device
> >> @@ -151,6 +156,7 @@ struct VirtIODevice
> >>       bool disable_legacy_check;
> >>       bool vhost_started;
> >>       VMChangeStateEntry *vmstate;
> >> +    VirtIODevMigration *migration;
> >>       char *bus_name;
> >>       uint8_t device_endian;
> >>       /**
> >> --
> >> 2.47.1
> >>
> >
>

[RFC 1/6] migration: Add virtio-iterative capability
[RFC 2/6] virtio-net: Reorder vmstate_virtio_net and helpers
[RFC 3/6] virtio-net: Add SaveVMHandlers for iterative migration
[RFC 4/6] virtio-net: iter live migration - migrate vmstate
[RFC 5/6] virtio, virtio-net: skip consistency check in virtio_load for iterative migration
[RFC 6/6] virtio-net: skip vhost_started assertion during iterative migration