[PATCH 5/5] migration: enable multifd and postcopy together

Posted by Prasad Pandit 3 weeks, 4 days ago
From: Prasad Pandit <pjp@fedoraproject.org>

Enable Multifd and Postcopy migration together.
The migration_ioc_process_incoming() routine
checks the magic value sent on each channel and
helps to properly set up the multifd and
postcopy channels.

The idea is to take advantage of the multifd
threads to accelerate the transfer of large
guest RAM to the destination and switch to
postcopy mode sooner.

The Precopy and Multifd threads work during the
initial guest RAM transfer. When migration moves
to the Postcopy phase, the source guest is
paused, so the Precopy and Multifd threads stop
sending data on their channels. Postcopy threads
on the destination request/pull data from the
source side.

Signed-off-by: Prasad Pandit <pjp@fedoraproject.org>
---
 migration/migration.c | 73 ++++++++++++++++++++++++++-----------------
 1 file changed, 44 insertions(+), 29 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index 021faee2f3..11fcc1e012 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -92,6 +92,9 @@ enum mig_rp_message_type {
     MIG_RP_MSG_MAX
 };
 
+/* Migration channel types */
+enum { CH_DEFAULT, CH_MULTIFD, CH_POSTCOPY };
+
 /* When we add fault tolerance, we could have several
    migrations at once.  For now we don't need to add
    dynamic creation of migration */
@@ -919,16 +922,15 @@ void migration_fd_process_incoming(QEMUFile *f)
  * Returns true when we want to start a new incoming migration process,
  * false otherwise.
  */
-static bool migration_should_start_incoming(bool main_channel)
+static bool migration_should_start_incoming(uint8_t channel)
 {
+    if (channel == CH_POSTCOPY) {
+        return false;
+    }
+
     /* Multifd doesn't start unless all channels are established */
     if (migrate_multifd()) {
-        return migration_has_all_channels();
-    }
-
-    /* Preempt channel only starts when the main channel is created */
-    if (migrate_postcopy_preempt()) {
-        return main_channel;
+        return multifd_recv_all_channels_created();
     }
 
     /*
@@ -936,7 +938,7 @@ static bool migration_should_start_incoming(bool main_channel)
      * it's the main channel that's being created, and we should always
      * proceed with this channel.
      */
-    assert(main_channel);
+    assert(channel == CH_DEFAULT);
     return true;
 }
 
@@ -945,13 +947,11 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
     MigrationIncomingState *mis = migration_incoming_get_current();
     Error *local_err = NULL;
     QEMUFile *f;
-    bool default_channel = true;
     uint32_t channel_magic = 0;
+    uint8_t channel = CH_DEFAULT;
     int ret = 0;
 
-    if (migrate_multifd() && !migrate_mapped_ram() &&
-        !migrate_postcopy_ram() &&
-        qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
+    if (qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
         /*
          * With multiple channels, it is possible that we receive channels
          * out of order on destination side, causing incorrect mapping of
@@ -969,35 +969,49 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
             return;
         }
 
-        default_channel = (channel_magic == cpu_to_be32(QEMU_VM_FILE_MAGIC));
-    } else {
-        default_channel = !mis->from_src_file;
+        if (channel_magic == cpu_to_be32(QEMU_VM_FILE_MAGIC)) {
+            channel = CH_DEFAULT;
+        } else if (channel_magic == cpu_to_be32(MULTIFD_MAGIC)) {
+            channel = CH_MULTIFD;
+        } else if (channel_magic == cpu_to_be32(POSTCOPY_MAGIC)) {
+            if (qio_channel_read_all(ioc, (char *)&channel_magic,
+                                sizeof(channel_magic), &local_err)) {
+                error_report_err(local_err);
+                return;
+            }
+            channel = CH_POSTCOPY;
+        } else {
+            error_report("%s: could not identify channel, unknown magic: %u",
+                           __func__, channel_magic);
+            return;
+        }
     }
 
     if (multifd_recv_setup(errp) != 0) {
         return;
     }
 
-    if (default_channel) {
+    if (channel == CH_DEFAULT) {
         f = qemu_file_new_input(ioc);
         migration_incoming_setup(f);
-    } else {
+    } else if (channel == CH_MULTIFD) {
         /* Multiple connections */
-        assert(migration_needs_multiple_sockets());
         if (migrate_multifd()) {
             multifd_recv_new_channel(ioc, &local_err);
-        } else {
+        }
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return;
+        }
+    } else if (channel == CH_POSTCOPY) {
+        if (migrate_postcopy()) {
             assert(migrate_postcopy_preempt());
             f = qemu_file_new_input(ioc);
             postcopy_preempt_new_channel(mis, f);
         }
-        if (local_err) {
-            error_propagate(errp, local_err);
-            return;
-        }
     }
 
-    if (migration_should_start_incoming(default_channel)) {
+    if (migration_should_start_incoming(channel)) {
         /* If it's a recovery, we're done */
         if (postcopy_try_recover()) {
             return;
@@ -1014,21 +1028,22 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
  */
 bool migration_has_all_channels(void)
 {
+    bool ret = false;
     MigrationIncomingState *mis = migration_incoming_get_current();
 
     if (!mis->from_src_file) {
-        return false;
+        return ret;
     }
 
     if (migrate_multifd()) {
-        return multifd_recv_all_channels_created();
+        ret = multifd_recv_all_channels_created();
     }
 
-    if (migrate_postcopy_preempt()) {
-        return mis->postcopy_qemufile_dst != NULL;
+    if (ret && migrate_postcopy_preempt()) {
+        ret = mis->postcopy_qemufile_dst != NULL;
     }
 
-    return true;
+    return ret;
 }
 
 int migrate_send_rp_switchover_ack(MigrationIncomingState *mis)
-- 
2.47.0
Re: [PATCH 5/5] migration: enable multifd and postcopy together
Posted by Peter Xu 2 weeks, 5 days ago
On Tue, Oct 29, 2024 at 08:39:08PM +0530, Prasad Pandit wrote:
> From: Prasad Pandit <pjp@fedoraproject.org>
> 
> Enable Multifd and Postcopy migration together.
> The migration_ioc_process_incoming() routine
> checks the magic value sent on each channel and
> helps to properly set up the multifd and
> postcopy channels.
> 
> The idea is to take advantage of the multifd
> threads to accelerate the transfer of large
> guest RAM to the destination and switch to
> postcopy mode sooner.
> 
> The Precopy and Multifd threads work during the
> initial guest RAM transfer. When migration moves
> to the Postcopy phase, the source guest is
> paused, so the Precopy and Multifd threads stop
> sending data on their channels. Postcopy threads
> on the destination request/pull data from the
> source side.

Hmm I think this is not accurate..

Precopy keeps sending data even during postcopy; that's the background
stream (with/without preempt feature enabled).  This may need some
amendment when you repost.

> 
> Signed-off-by: Prasad Pandit <pjp@fedoraproject.org>
> ---
>  migration/migration.c | 73 ++++++++++++++++++++++++++-----------------
>  1 file changed, 44 insertions(+), 29 deletions(-)
> 
> diff --git a/migration/migration.c b/migration/migration.c
> index 021faee2f3..11fcc1e012 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -92,6 +92,9 @@ enum mig_rp_message_type {
>      MIG_RP_MSG_MAX
>  };
>  
> +/* Migration channel types */
> +enum { CH_DEFAULT, CH_MULTIFD, CH_POSTCOPY };
> +
>  /* When we add fault tolerance, we could have several
>     migrations at once.  For now we don't need to add
>     dynamic creation of migration */
> @@ -919,16 +922,15 @@ void migration_fd_process_incoming(QEMUFile *f)
>   * Returns true when we want to start a new incoming migration process,
>   * false otherwise.
>   */
> -static bool migration_should_start_incoming(bool main_channel)
> +static bool migration_should_start_incoming(uint8_t channel)
>  {
> +    if (channel == CH_POSTCOPY) {
> +        return false;
> +    }

Please see my other reply, I think here it should never be the POSTCOPY
channel, because the postcopy (aka preempt) channel is only created after
the main channel..

So I wonder whether this "if" will ever hit at all.

> +
>      /* Multifd doesn't start unless all channels are established */
>      if (migrate_multifd()) {
> -        return migration_has_all_channels();
> -    }
> -
> -    /* Preempt channel only starts when the main channel is created */
> -    if (migrate_postcopy_preempt()) {
> -        return main_channel;
> +        return multifd_recv_all_channels_created();

I think this is incorrect.. We also need to check that the main channel is
established before starting incoming.  The old code uses
migration_has_all_channels(), which checks that.
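
One way to keep that check while handling the new channel types would be
(an untested sketch; it assumes migration_has_all_channels() keeps its
pre-patch semantics):

    static bool migration_should_start_incoming(uint8_t channel)
    {
        /* Postcopy channel arrives last; never (re)start incoming on it */
        if (channel == CH_POSTCOPY) {
            return false;
        }

        /* Multifd doesn't start unless all channels are established */
        if (migrate_multifd()) {
            return migration_has_all_channels();
        }

        assert(channel == CH_DEFAULT);
        return true;
    }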

>      }
>  
>      /*
> @@ -936,7 +938,7 @@ static bool migration_should_start_incoming(bool main_channel)
>       * it's the main channel that's being created, and we should always
>       * proceed with this channel.
>       */
> -    assert(main_channel);
> +    assert(channel == CH_DEFAULT);
>      return true;
>  }
>  
> @@ -945,13 +947,11 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
>      MigrationIncomingState *mis = migration_incoming_get_current();
>      Error *local_err = NULL;
>      QEMUFile *f;
> -    bool default_channel = true;
>      uint32_t channel_magic = 0;
> +    uint8_t channel = CH_DEFAULT;
>      int ret = 0;
>  
> -    if (migrate_multifd() && !migrate_mapped_ram() &&
> -        !migrate_postcopy_ram() &&
> -        qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
> +    if (qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
>          /*
>           * With multiple channels, it is possible that we receive channels
>           * out of order on destination side, causing incorrect mapping of
> @@ -969,35 +969,49 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
>              return;
>          }
>  
> -        default_channel = (channel_magic == cpu_to_be32(QEMU_VM_FILE_MAGIC));
> -    } else {
> -        default_channel = !mis->from_src_file;
> +        if (channel_magic == cpu_to_be32(QEMU_VM_FILE_MAGIC)) {
> +            channel = CH_DEFAULT;
> +        } else if (channel_magic == cpu_to_be32(MULTIFD_MAGIC)) {
> +            channel = CH_MULTIFD;
> +        } else if (channel_magic == cpu_to_be32(POSTCOPY_MAGIC)) {
> +            if (qio_channel_read_all(ioc, (char *)&channel_magic,
> +                                sizeof(channel_magic), &local_err)) {
> +                error_report_err(local_err);
> +                return;
> +            }
> +            channel = CH_POSTCOPY;
> +        } else {
> +            error_report("%s: could not identify channel, unknown magic: %u",
> +                           __func__, channel_magic);
> +            return;
> +        }
>      }
>  
>      if (multifd_recv_setup(errp) != 0) {
>          return;
>      }
>  
> -    if (default_channel) {
> +    if (channel == CH_DEFAULT) {
>          f = qemu_file_new_input(ioc);
>          migration_incoming_setup(f);
> -    } else {
> +    } else if (channel == CH_MULTIFD) {
>          /* Multiple connections */
> -        assert(migration_needs_multiple_sockets());
>          if (migrate_multifd()) {
>              multifd_recv_new_channel(ioc, &local_err);
> -        } else {
> +        }
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            return;
> +        }
> +    } else if (channel == CH_POSTCOPY) {
> +        if (migrate_postcopy()) {
>              assert(migrate_postcopy_preempt());
>              f = qemu_file_new_input(ioc);
>              postcopy_preempt_new_channel(mis, f);
>          }
> -        if (local_err) {
> -            error_propagate(errp, local_err);
> -            return;
> -        }
>      }
>  
> -    if (migration_should_start_incoming(default_channel)) {
> +    if (migration_should_start_incoming(channel)) {
>          /* If it's a recovery, we're done */
>          if (postcopy_try_recover()) {
>              return;
> @@ -1014,21 +1028,22 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
>   */
>  bool migration_has_all_channels(void)
>  {
> +    bool ret = false;
>      MigrationIncomingState *mis = migration_incoming_get_current();
>  
>      if (!mis->from_src_file) {
> -        return false;
> +        return ret;
>      }
>  
>      if (migrate_multifd()) {
> -        return multifd_recv_all_channels_created();
> +        ret = multifd_recv_all_channels_created();
>      }
>  
> -    if (migrate_postcopy_preempt()) {
> -        return mis->postcopy_qemufile_dst != NULL;
> +    if (ret && migrate_postcopy_preempt()) {
> +        ret = mis->postcopy_qemufile_dst != NULL;
>      }

IMHO it's clearer written as:

       if (migrate_multifd()) {
           if (!multifd_recv_all_channels_created()) {
               return false;
           }
       }

       if (migrate_postcopy_preempt()) {
           if (mis->postcopy_qemufile_dst == NULL) {
               return false;
           }
       }

       return true;

>  
> -    return true;
> +    return ret;
>  }

I don't yet see how you manage the multifd threads, etc, on both sides, or
any logic to make sure multifd will properly flush the pages before
postcopy starts.  IOW, any guarantee that all the pages will only be
installed using UFFDIO_COPY once vCPUs start on dest.  Any comments?

The most complicated part of this work would be testing, to make sure it
works in all previous cases, and that's largely why we disabled it before:
it randomly crashed, but not always; fundamentally it's because when
multifd was designed there wasn't enough consideration on working together
with postcopy.  We didn't clearly know what was missing at that time.

So we would definitely need to add test cases, just like whoever adds new
features to migration, to make sure it at least works for the existing
multifd / postcopy test cases, but with both features enabled.

Some hints on what we can add (assuming we already enable both features);
a rough qtest sketch follows this list:

  - All possible multifd test cases can run one more time with postcopy
    enabled, but where precopy converges and finishes / cancels the migration.

    e.g.:

    /x86_64/migration/multifd/file/mapped-ram/*

    These need to keep working as before; they should simply ignore
    postcopy being enabled.

    /x86_64/migration/multifd/tcp/uri/plain/none

    This one is the most generic multifd test; we'd want to make this run
    again with postcopy enabled, to verify it keeps working when it can
    converge before postcopy starts.

    /x86_64/migration/multifd/tcp/plain/cancel

    This one tests multifd cancellations, and we want to make sure this
    works too when postcopy is enabled.

  - All possible postcopy test cases can also run one more time with
    multifd enabled.  Examples:

    /x86_64/migration/postcopy/plain

    The most generic postcopy test; we want to run this with multifd
    enabled, which will cover the simplest use case of
    multifd-precopy plus postcopy.

    /x86_64/migration/postcopy/recovery/*

    These tests cover the fundamental postcopy recovery paths (plain,
    failure in handshake, failure in reconnect, or when using TLS); we may
    want to make sure these work even if the multifd cap is enabled.

    /x86_64/migration/postcopy/preempt/*

    Similar to the above, but now with preempt channels enabled too.
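
As a starting point, here is a rough sketch of one such combined test for
tests/qtest/migration-test.c (untested; the hook and helper names follow
the existing tests there, but the exact signatures may differ):

    /* Hypothetical hook: enable multifd on top of a postcopy test */
    static void *
    migrate_hook_start_multifd(QTestState *from, QTestState *to)
    {
        migrate_set_parameter_int(from, "multifd-channels", 4);
        migrate_set_parameter_int(to, "multifd-channels", 4);
        migrate_set_capability(from, "multifd", true);
        migrate_set_capability(to, "multifd", true);
        return NULL;
    }

    static void test_multifd_postcopy(void)
    {
        MigrateCommon args = {
            .start_hook = migrate_hook_start_multifd,
        };

        /* Reuse the generic postcopy flow, now with multifd enabled */
        test_postcopy_common(&args);
    }

    migration_test_add("/migration/postcopy/multifd/plain",
                       test_multifd_postcopy);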

It will add quite a few tests to run here, but I don't see a good way
otherwise when we want to enable the two features, because it is indeed a
challenge to enable these two major features together.

You can also do some manual tests with real workloads when working on this
series; that will definitely be very helpful.  I have a feeling that this
series could already fail some of them when both features are enabled, but
please give it a shot.

Thanks,

-- 
Peter Xu
Re: [PATCH 5/5] migration: enable multifd and postcopy together
Posted by Prasad Pandit 2 weeks, 4 days ago
On Mon, 4 Nov 2024 at 23:19, Peter Xu <peterx@redhat.com> wrote:
> Precopy keeps sending data even during postcopy; that's the background
> stream (with/without preempt feature enabled).  This may need some
> amendment when you repost.

* Okay.

> > +    if (channel == CH_POSTCOPY) {
> > +        return false;
> > +    }
>
> Please see my other reply, I think here it should never be the POSTCOPY
> channel, because the postcopy (aka preempt) channel is only created after
> the main channel.. So I wonder whether this "if" will ever hit at all.

* It does. migration_ioc_process_incoming() is called for each
incoming connection, and each time it calls
migration_should_start_incoming() to check whether it should proceed
with the migration or wait for further connections, because multifd
does not start until all of its connections are established.

* Similarly, when the Postcopy channel is initiated towards the end of
Precopy migration, migration_should_start_incoming() gets called and
returns 'false', because we don't want to start a new incoming
migration at that point. Earlier it received the boolean value
'default_channel' as a parameter, which was set to false while
accepting the 'postcopy' connection via => default_channel = !mis->from_src_file;
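
For example, with two multifd channels the destination could see the
connections in this order (illustrative):

  conn 1: QEMU_VM_FILE_MAGIC -> CH_DEFAULT  -> start incoming? no (multifd pending)
  conn 2: MULTIFD_MAGIC      -> CH_MULTIFD  -> start incoming? no
  conn 3: MULTIFD_MAGIC      -> CH_MULTIFD  -> start incoming? yes
  ...later, towards the end of precopy...
  conn 4: POSTCOPY_MAGIC     -> CH_POSTCOPY -> start incoming? no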

> > +
> >      /* Multifd doesn't start unless all channels are established */
> >      if (migrate_multifd()) {
> > -        return migration_has_all_channels();
> > -    }
> > -
> > -    /* Preempt channel only starts when the main channel is created */
> > -    if (migrate_postcopy_preempt()) {
> > -        return main_channel;
> > +        return multifd_recv_all_channels_created();
>
> I think this is incorrect.. We also need to check that the main channel is
> established before starting incoming.  The old code uses
> migration_has_all_channels(), which checks that.

* Okay, will try to fix it better.  But calling
migration_has_all_channels() after checking migrate_multifd() does not
read right.

> I don't yet see how you manage the multifd threads, etc, on both sides, or
> any logic to make sure multifd will properly flush the pages before
> postcopy starts.  IOW, any guarantee that all the pages will only be
> installed using UFFDIO_COPY once vCPUs start on dest.  Any comments?

* There are no changes related to that, as this series only tries to
enable the multifd and postcopy options together.

> The most complicated part of this work would be testing, to make sure it
> works in all previous cases, and that's largely why we disabled it before:
> it randomly crashed, but not always; fundamentally it's because when
> multifd was designed there wasn't enough consideration on working together
> with postcopy.  We didn't clearly know what was missing at that time.

* Right. I have tested this series with the following migration
commands to confirm that migration works as expected; no crashes were
observed.
===
[Precopy]
# virsh migrate --verbose --live --auto-converge \
      f39vm qemu+ssh://<destination-host-name>/system

[Precopy + Multifd]
# virsh migrate --verbose --live --auto-converge \
      --parallel --parallel-connections 02 \
      f39vm qemu+ssh://<destination-host-name>/system
# virsh migrate --verbose --live --auto-converge \
      --parallel --parallel-connections 04 \
      f39vm qemu+ssh://<destination-host-name>/system
# virsh migrate --verbose --live --auto-converge \
      --parallel --parallel-connections 08 \
      f39vm qemu+ssh://<destination-host-name>/system
# virsh migrate --verbose --live --auto-converge \
      --parallel --parallel-connections 16 \
      f39vm qemu+ssh://<destination-host-name>/system

[Postcopy]
# virsh migrate --verbose --live --auto-converge \
      --postcopy --postcopy-after-precopy \
      f39vm qemu+ssh://<destination-host-name>/system

[Postcopy + Multifd]
# virsh migrate --verbose --live --auto-converge \
      --parallel --parallel-connections 02 \
      --postcopy --postcopy-after-precopy \
      f39vm qemu+ssh://<destination-host-name>/system
# virsh migrate --verbose --live --auto-converge \
      --parallel --parallel-connections 04 \
      --postcopy --postcopy-after-precopy \
      f39vm qemu+ssh://<destination-host-name>/system
# virsh migrate --verbose --live --auto-converge \
      --parallel --parallel-connections 08 \
      --postcopy --postcopy-after-precopy \
      f39vm qemu+ssh://<destination-host-name>/system
# virsh migrate --verbose --live --auto-converge \
      --parallel --parallel-connections 16 \
      --postcopy --postcopy-after-precopy \
      f39vm qemu+ssh://<destination-host-name>/system
===

> So we would definitely need to add test cases, just like whoever adds new
> features to migration, to make sure it at least works for the existing
> multifd / postcopy test cases, but with both features enabled.
...
> It will add quite a few tests to run here, but I don't see a good way
> otherwise when we want to enable the two features, because it is indeed a
> challenge to enable these two major features together.
>

* Thank you for the hints about the tests, will surely look into them
and try to learn how to add new test cases.

Thank you.
---
  - Prasad
Re: [PATCH 5/5] migration: enable multifd and postcopy together
Posted by Peter Xu 2 weeks, 4 days ago
On Tue, Nov 05, 2024 at 05:24:55PM +0530, Prasad Pandit wrote:
> On Mon, 4 Nov 2024 at 23:19, Peter Xu <peterx@redhat.com> wrote:
> > Precopy keeps sending data even during postcopy; that's the background
> > stream (with/without preempt feature enabled).  This may need some
> > amendment when you repost.
> 
> * Okay.
> 
> > > +    if (channel == CH_POSTCOPY) {
> > > +        return false;
> > > +    }
> >
> > Please see my other reply, I think here it should never be the POSTCOPY
> > channel, because the postcopy (aka preempt) channel is only created after
> > the main channel.. So I wonder whether this "if" will ever hit at all.
> 
> * It does. migration_ioc_process_incoming() is called for each
> incoming connection, and each time it calls
> migration_should_start_incoming() to check whether it should proceed
> with the migration or wait for further connections, because multifd
> does not start until all of its connections are established.
> 
> * Similarly, when the Postcopy channel is initiated towards the end of
> Precopy migration, migration_should_start_incoming() gets called and
> returns 'false', because we don't want to start a new incoming
> migration at that point. Earlier it received the boolean value
> 'default_channel' as a parameter, which was set to false while
> accepting the 'postcopy' connection via => default_channel = !mis->from_src_file;

Hmm yes, it will happen, but it should only happen after the main channel
is established.

> 
> > > +
> > >      /* Multifd doesn't start unless all channels are established */
> > >      if (migrate_multifd()) {
> > > -        return migration_has_all_channels();
> > > -    }
> > > -
> > > -    /* Preempt channel only starts when the main channel is created */
> > > -    if (migrate_postcopy_preempt()) {
> > > -        return main_channel;
> > > +        return multifd_recv_all_channels_created();
> >
> > I think this is incorrect.. We also need to check that the main channel is
> > established before starting incoming.  The old code uses
> > migration_has_all_channels(), which checks that.
> 
> * Okay, will try to fix it better.  But calling
> migration_has_all_channels() after checking migrate_multifd() does not
> read right.
> 
> > I don't yet see how you manage the multifd threads, etc, on both sides, or
> > any logic to make sure multifd will properly flush the pages before
> > postcopy starts.  IOW, any guarantee that all the pages will only be
> > installed using UFFDIO_COPY once vCPUs start on dest.  Any comments?
> 
> * There are no changes related to that, as this series only tries to
> enable the multifd and postcopy options together.

In short, qemu_savevm_state_complete_precopy_iterable() is skipped in
postcopy.  I don't yet see anywhere that multifd flushes pages.  We need
to flush multifd pages before vCPUs start on dest.

As we discussed off-list, we can add an assertion in multifd recv threads
to make sure it won't touch any guest page during active postcopy.

Maybe something like:

diff --git a/migration/multifd.c b/migration/multifd.c
index 4374e14a96..32425137bd 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -1182,6 +1182,13 @@ static void *multifd_recv_thread(void *opaque)
         }
 
         if (has_data) {
+            /*
+             * Now we're going to receive some guest pages into iov
+             * buffers.  If it's during postcopy, it means vcpu can be
+             * running, so this can corrupt memory when modified
+             * concurrently by multifd threads!
+             */
+            assert(!migration_in_postcopy());
             ret = multifd_recv_state->ops->recv(p, &local_err);
             if (ret != 0) {
                 break;
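
And on the source side, the counterpart would presumably be a multifd
flush/sync before switching over to postcopy, e.g. somewhere around
postcopy_start() (an untested sketch; the placement is hypothetical,
assuming multifd_send_sync_main() is the right primitive here):

    /*
     * Hypothetical: make sure all multifd pages are sent and synced
     * before postcopy starts, so multifd threads never write guest
     * memory while vCPUs may already be running on dest.
     */
    if (migrate_multifd()) {
        multifd_send_sync_main();
    }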

> 
> > The most complicated part of this work would be testing, to make sure it
> > works in all previous cases, and that's largely why we disabled it before:
> > it randomly crashed, but not always; fundamentally it's because when
> > multifd was designed there wasn't enough consideration on working together
> > with postcopy.  We didn't clearly know what was missing at that time.
> 
> * Right. I have tested this series with the following migration
> commands to confirm that migration works as expected; no crashes were
> observed.
> ===
> [Precopy]
> # virsh migrate --verbose --live --auto-converge \
>       f39vm qemu+ssh://<destination-host-name>/system
> 
> [Precopy + Multifd]
> # virsh migrate --verbose --live --auto-converge \
>       --parallel --parallel-connections 02 \
>       f39vm qemu+ssh://<destination-host-name>/system
> # virsh migrate --verbose --live --auto-converge \
>       --parallel --parallel-connections 04 \
>       f39vm qemu+ssh://<destination-host-name>/system
> # virsh migrate --verbose --live --auto-converge \
>       --parallel --parallel-connections 08 \
>       f39vm qemu+ssh://<destination-host-name>/system
> # virsh migrate --verbose --live --auto-converge \
>       --parallel --parallel-connections 16 \
>       f39vm qemu+ssh://<destination-host-name>/system
> 
> [Postcopy]
> # virsh migrate --verbose --live --auto-converge \
>       --postcopy --postcopy-after-precopy \
>       f39vm qemu+ssh://<destination-host-name>/system

I'd suggest we try interrupting postcopy with migrate-pause, then go
through postcopy recover.  I wonder how the current series works if a
network failure fails all the multifd iochannels, and how reconnect works.
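
E.g. something along these lines (a QMP sketch; the port and addresses
are made up):

    (on the source, while postcopy is active)
    { "execute": "migrate-pause" }

    (on the destination, listen on a fresh address)
    { "execute": "migrate-recover",
      "arguments": { "uri": "tcp:0.0.0.0:4444" } }

    (back on the source, resume towards the new address)
    { "execute": "migrate",
      "arguments": { "uri": "tcp:<dest-host>:4444", "resume": true } }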

> 
> [Postcopy + Multifd]
> # virsh migrate --verbose --live --auto-converge \
>       --parallel --parallel-connections 02 \
>       --postcopy --postcopy-after-precopy \
>       f39vm qemu+ssh://<destination-host-name>/system
> # virsh migrate --verbose --live --auto-converge \
>       --parallel --parallel-connections 04 \
>       --postcopy --postcopy-after-precopy \
>       f39vm qemu+ssh://<destination-host-name>/system
> # virsh migrate --verbose --live --auto-converge \
>       --parallel --parallel-connections 08 \
>       --postcopy --postcopy-after-precopy \
>       f39vm qemu+ssh://<destination-host-name>/system
> # virsh migrate --verbose --live --auto-converge \
>       --parallel --parallel-connections 16 \
>       --postcopy --postcopy-after-precopy \
>       f39vm qemu+ssh://<destination-host-name>/system
> ===
> 
> > So we would definitely need to add test cases, just like whoever adds new
> > features to migration, to make sure it at least works for the existing
> > multifd / postcopy test cases, but with both features enabled.
> ...
> > It will add quite a few tests to run here, but I don't see a good way
> > otherwise when we want to enable the two features, because it is indeed a
> > challenge to enable these two major features together.
> >
> 
> * Thank you for the hints about the tests, will surely look into them
> and try to learn how to add new test cases.

Thanks,

-- 
Peter Xu