From: Prasad Pandit <pjp@fedoraproject.org>
The various logical migration channels don't have a
standardized way of advertising themselves, and their
connections may be seen out of order by the migration
destination. When a new connection arrives, the incoming
migration currently makes use of heuristics to determine
which channel it belongs to.
The next few patches will need to change how the multifd
and postcopy capabilities interact and that affects the
channel discovery heuristic.
Refactor the channel discovery heuristic to make it less
opaque and simplify the subsequent patches.
Signed-off-by: Prasad Pandit <pjp@fedoraproject.org>
---
migration/migration.c | 124 +++++++++++++++++++++++-------------------
1 file changed, 69 insertions(+), 55 deletions(-)
v8:
- Separate this patch out from earlier patch-2
v7:
- https://lore.kernel.org/qemu-devel/20250228121749.553184-1-ppandit@redhat.com/T/#t
diff --git a/migration/migration.c b/migration/migration.c
index d46e776e24..f97bb2777f 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -95,6 +95,9 @@ enum mig_rp_message_type {
MIG_RP_MSG_MAX
};
+/* Migration channel types */
+enum { CH_MAIN, CH_MULTIFD, CH_POSTCOPY };
+
/* When we add fault tolerance, we could have several
migrations at once. For now we don't need to add
dynamic creation of migration */
@@ -985,28 +988,19 @@ void migration_fd_process_incoming(QEMUFile *f)
migration_incoming_process();
}
-/*
- * Returns true when we want to start a new incoming migration process,
- * false otherwise.
- */
-static bool migration_should_start_incoming(bool main_channel)
+static bool migration_has_main_and_multifd_channels(void)
{
- /* Multifd doesn't start unless all channels are established */
- if (migrate_multifd()) {
- return migration_has_all_channels();
+ MigrationIncomingState *mis = migration_incoming_get_current();
+ if (!mis->from_src_file) {
+ /* main channel not established */
+ return false;
}
- /* Preempt channel only starts when the main channel is created */
- if (migrate_postcopy_preempt()) {
- return main_channel;
+ if (migrate_multifd() && !multifd_recv_all_channels_created()) {
+ return false;
}
- /*
- * For all the rest types of migration, we should only reach here when
- * it's the main channel that's being created, and we should always
- * proceed with this channel.
- */
- assert(main_channel);
+ /* main and all multifd channels are established */
return true;
}
@@ -1015,59 +1009,84 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
MigrationIncomingState *mis = migration_incoming_get_current();
Error *local_err = NULL;
QEMUFile *f;
- bool default_channel = true;
+ uint8_t channel;
uint32_t channel_magic = 0;
int ret = 0;
- if (migrate_multifd() && !migrate_mapped_ram() &&
- !migrate_postcopy_ram() &&
- qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
- /*
- * With multiple channels, it is possible that we receive channels
- * out of order on destination side, causing incorrect mapping of
- * source channels on destination side. Check channel MAGIC to
- * decide type of channel. Please note this is best effort, postcopy
- * preempt channel does not send any magic number so avoid it for
- * postcopy live migration. Also tls live migration already does
- * tls handshake while initializing main channel so with tls this
- * issue is not possible.
- */
- ret = migration_channel_read_peek(ioc, (void *)&channel_magic,
- sizeof(channel_magic), errp);
+ if (!migration_has_main_and_multifd_channels()) {
+ if (qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
+ /*
+ * With multiple channels, it is possible that we receive channels
+ * out of order on destination side, causing incorrect mapping of
+ * source channels on destination side. Check channel MAGIC to
+ * decide type of channel. Please note this is best effort,
+ * postcopy preempt channel does not send any magic number so
+ * avoid it for postcopy live migration. Also tls live migration
+ * already does tls handshake while initializing main channel so
+ * with tls this issue is not possible.
+ */
+ ret = migration_channel_read_peek(ioc, (void *)&channel_magic,
+ sizeof(channel_magic), errp);
+ if (ret != 0) {
+ return;
+ }
- if (ret != 0) {
+ channel_magic = be32_to_cpu(channel_magic);
+ if (channel_magic == QEMU_VM_FILE_MAGIC) {
+ channel = CH_MAIN;
+ } else if (channel_magic == MULTIFD_MAGIC) {
+ channel = CH_MULTIFD;
+ } else if (!mis->from_src_file &&
+ mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
+ /* reconnect main channel for postcopy recovery */
+ channel = CH_MAIN;
+ } else {
+ error_setg(errp, "unknown channel magic: %u", channel_magic);
+ return;
+ }
+ } else if (mis->from_src_file && migrate_multifd()) {
+ /*
+ * Non-peekable channels like tls/file are processed as
+ * multifd channels when multifd is enabled.
+ */
+ channel = CH_MULTIFD;
+ } else if (!mis->from_src_file) {
+ channel = CH_MAIN;
+ } else {
+ error_setg(errp, "non-peekable channel used without multifd");
return;
}
-
- default_channel = (channel_magic == cpu_to_be32(QEMU_VM_FILE_MAGIC));
+ } else if (mis->from_src_file) {
+ channel = CH_POSTCOPY;
} else {
- default_channel = !mis->from_src_file;
+ channel = CH_MAIN;
}
if (multifd_recv_setup(errp) != 0) {
return;
}
- if (default_channel) {
+ if (channel == CH_MAIN) {
f = qemu_file_new_input(ioc);
migration_incoming_setup(f);
- } else {
+ } else if (channel == CH_MULTIFD) {
/* Multiple connections */
- assert(migration_needs_multiple_sockets());
if (migrate_multifd()) {
multifd_recv_new_channel(ioc, &local_err);
- } else {
- assert(migrate_postcopy_preempt());
- f = qemu_file_new_input(ioc);
- postcopy_preempt_new_channel(mis, f);
}
if (local_err) {
error_propagate(errp, local_err);
return;
}
+ } else if (channel == CH_POSTCOPY) {
+ assert(migrate_postcopy_preempt());
+ assert(!mis->postcopy_qemufile_dst);
+ f = qemu_file_new_input(ioc);
+ postcopy_preempt_new_channel(mis, f);
+ return;
}
- if (migration_should_start_incoming(default_channel)) {
+ if (migration_has_main_and_multifd_channels()) {
/* If it's a recovery, we're done */
if (postcopy_try_recover()) {
return;
@@ -1084,20 +1103,15 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
*/
bool migration_has_all_channels(void)
{
+ if (!migration_has_main_and_multifd_channels()) {
+ return false;
+ }
+
MigrationIncomingState *mis = migration_incoming_get_current();
-
- if (!mis->from_src_file) {
+ if (migrate_postcopy_preempt() && !mis->postcopy_qemufile_dst) {
return false;
}
- if (migrate_multifd()) {
- return multifd_recv_all_channels_created();
- }
-
- if (migrate_postcopy_preempt()) {
- return mis->postcopy_qemufile_dst != NULL;
- }
-
return true;
}
--
2.48.1
Prasad Pandit <ppandit@redhat.com> writes: > From: Prasad Pandit <pjp@fedoraproject.org> > > The various logical migration channels don't have a > standardized way of advertising themselves and their > connections may be seen out of order by the migration > destination. When a new connection arrives, the incoming > migration currently make use of heuristics to determine > which channel it belongs to. > > The next few patches will need to change how the multifd > and postcopy capabilities interact and that affects the > channel discovery heuristic. > > Refactor the channel discovery heuristic to make it less > opaque and simplify the subsequent patches. > > Signed-off-by: Prasad Pandit <pjp@fedoraproject.org> > --- > migration/migration.c | 124 +++++++++++++++++++++++------------------- > 1 file changed, 69 insertions(+), 55 deletions(-) > > v8: > - Separate this patch out from earlier patch-2 > > v7: > - https://lore.kernel.org/qemu-devel/20250228121749.553184-1-ppandit@redhat.com/T/#t > > diff --git a/migration/migration.c b/migration/migration.c > index d46e776e24..f97bb2777f 100644 > --- a/migration/migration.c > +++ b/migration/migration.c > @@ -95,6 +95,9 @@ enum mig_rp_message_type { > MIG_RP_MSG_MAX > }; > > +/* Migration channel types */ > +enum { CH_MAIN, CH_MULTIFD, CH_POSTCOPY }; > + > /* When we add fault tolerance, we could have several > migrations at once. For now we don't need to add > dynamic creation of migration */ > @@ -985,28 +988,19 @@ void migration_fd_process_incoming(QEMUFile *f) > migration_incoming_process(); > } > > -/* > - * Returns true when we want to start a new incoming migration process, > - * false otherwise. 
> - */ > -static bool migration_should_start_incoming(bool main_channel) > +static bool migration_has_main_and_multifd_channels(void) > { > - /* Multifd doesn't start unless all channels are established */ > - if (migrate_multifd()) { > - return migration_has_all_channels(); > + MigrationIncomingState *mis = migration_incoming_get_current(); > + if (!mis->from_src_file) { > + /* main channel not established */ > + return false; > } > > - /* Preempt channel only starts when the main channel is created */ > - if (migrate_postcopy_preempt()) { > - return main_channel; > + if (migrate_multifd() && !multifd_recv_all_channels_created()) { > + return false; > } > > - /* > - * For all the rest types of migration, we should only reach here when > - * it's the main channel that's being created, and we should always > - * proceed with this channel. > - */ > - assert(main_channel); > + /* main and all multifd channels are established */ > return true; > } > > @@ -1015,59 +1009,84 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) > MigrationIncomingState *mis = migration_incoming_get_current(); > Error *local_err = NULL; > QEMUFile *f; > - bool default_channel = true; > + uint8_t channel; > uint32_t channel_magic = 0; > int ret = 0; > > - if (migrate_multifd() && !migrate_mapped_ram() && > - !migrate_postcopy_ram() && > - qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) { > - /* > - * With multiple channels, it is possible that we receive channels > - * out of order on destination side, causing incorrect mapping of > - * source channels on destination side. Check channel MAGIC to > - * decide type of channel. Please note this is best effort, postcopy > - * preempt channel does not send any magic number so avoid it for > - * postcopy live migration. Also tls live migration already does > - * tls handshake while initializing main channel so with tls this > - * issue is not possible. 
> - */ > - ret = migration_channel_read_peek(ioc, (void *)&channel_magic, > - sizeof(channel_magic), errp); > + if (!migration_has_main_and_multifd_channels()) { > + if (qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) { > + /* > + * With multiple channels, it is possible that we receive channels > + * out of order on destination side, causing incorrect mapping of > + * source channels on destination side. Check channel MAGIC to > + * decide type of channel. Please note this is best effort, > + * postcopy preempt channel does not send any magic number so > + * avoid it for postcopy live migration. Also tls live migration > + * already does tls handshake while initializing main channel so > + * with tls this issue is not possible. > + */ > + ret = migration_channel_read_peek(ioc, (void *)&channel_magic, > + sizeof(channel_magic), errp); > + if (ret != 0) { > + return; > + } > > - if (ret != 0) { > + channel_magic = be32_to_cpu(channel_magic); > + if (channel_magic == QEMU_VM_FILE_MAGIC) { > + channel = CH_MAIN; > + } else if (channel_magic == MULTIFD_MAGIC) { > + channel = CH_MULTIFD; > + } else if (!mis->from_src_file && > + mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) { > + /* reconnect main channel for postcopy recovery */ > + channel = CH_MAIN; > + } else { > + error_setg(errp, "unknown channel magic: %u", channel_magic); > + return; > + } > + } else if (mis->from_src_file && migrate_multifd()) { > + /* > + * Non-peekable channels like tls/file are processed as > + * multifd channels when multifd is enabled. > + */ > + channel = CH_MULTIFD; > + } else if (!mis->from_src_file) { > + channel = CH_MAIN; > + } else { > + error_setg(errp, "non-peekable channel used without multifd"); > return; > } > - > - default_channel = (channel_magic == cpu_to_be32(QEMU_VM_FILE_MAGIC)); > + } else if (mis->from_src_file) { This is redundant. 
> + channel = CH_POSTCOPY; > } else { > - default_channel = !mis->from_src_file; > + channel = CH_MAIN; And this is impossible. > } > > if (multifd_recv_setup(errp) != 0) { > return; > } > > - if (default_channel) { > + if (channel == CH_MAIN) { > f = qemu_file_new_input(ioc); > migration_incoming_setup(f); We should probably expand migration_incoming_setup() to make it clear that mis->from_src_file is set at this point. And assert(!mis->from_src_file). I can send a patch on top later. > - } else { > + } else if (channel == CH_MULTIFD) { > /* Multiple connections */ > - assert(migration_needs_multiple_sockets()); > if (migrate_multifd()) { This should be an assert. > multifd_recv_new_channel(ioc, &local_err); > - } else { > - assert(migrate_postcopy_preempt()); > - f = qemu_file_new_input(ioc); > - postcopy_preempt_new_channel(mis, f); > } > if (local_err) { > error_propagate(errp, local_err); > return; > } > + } else if (channel == CH_POSTCOPY) { > + assert(migrate_postcopy_preempt()); > + assert(!mis->postcopy_qemufile_dst); > + f = qemu_file_new_input(ioc); > + postcopy_preempt_new_channel(mis, f); > + return; > } > > - if (migration_should_start_incoming(default_channel)) { > + if (migration_has_main_and_multifd_channels()) { I think there's a bug here. Excluding multifd from the picture, if only the main channel needs to be setup, then it's possible to start postcopy recovery twice, once when the main channel appears and another time when the preempt channel appears. The previous code worked differently because it did: if (migrate_postcopy_preempt()) { return main_channel; which would return false when preempt arrived after main. We could use migration_has_all_channels() instead, that would look more logically correct, but it would also change the current behavior that postcopy recovery can start before the preempt channel is in place. I'm not even sure if that's actually part of the design of the feature. 
> /* If it's a recovery, we're done */ > if (postcopy_try_recover()) { > return; > @@ -1084,20 +1103,15 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) > */ > bool migration_has_all_channels(void) > { > + if (!migration_has_main_and_multifd_channels()) { > + return false; > + } > + > MigrationIncomingState *mis = migration_incoming_get_current(); > - > - if (!mis->from_src_file) { > + if (migrate_postcopy_preempt() && !mis->postcopy_qemufile_dst) { > return false; > } > > - if (migrate_multifd()) { > - return multifd_recv_all_channels_created(); > - } > - > - if (migrate_postcopy_preempt()) { > - return mis->postcopy_qemufile_dst != NULL; > - } > - > return true; > }
Hello Fabiano, On Mon, 31 Mar 2025 at 20:31, Fabiano Rosas <farosas@suse.de> wrote: > > + } else if (mis->from_src_file) { > This is redundant. * This was to ensure (double check) that when the Postcopy connection comes in, the main channel is established. Also a couple of versions back migration qtest was failing without this check. Nonetheless, qtests do work now without this check. I'll remove it if we must. > > + channel = CH_POSTCOPY; > > } else { > > - default_channel = !mis->from_src_file; > > + channel = CH_MAIN; > > And this is impossible. -> https://lore.kernel.org/qemu-devel/20250215123119.814345-1-ppandit@redhat.com/T/#m18b6bf30e877f9eafaa67bba6a209b47782f6eac * Yes, but a couple of revisions back you suggested adding it saying CH_MAIN assignment at the top was doing some heavy lifting and it's more clear this way. > We should probably expand migration_incoming_setup() to make it clear > that mis->from_src_file is set at this point. And > assert(!mis->from_src_file). I can send a patch on top later. * migration_incoming_setup uses the QEMUFile object only when mis->from_src_file is not set. I'm wondering if we really need an assert(!mis->from_src_file) check? Because it'll reach here only when channel == CH_MAIN and channel is set to CH_MAIN only when mis->from_src_file is NULL. > > - } else { > > + } else if (channel == CH_MULTIFD) { > > /* Multiple connections */ > > - assert(migration_needs_multiple_sockets()); > > if (migrate_multifd()) { > > This should be an assert. Same, 'channel' is set to CH_MULTIFD, only when migrate_multifd() is enabled. Do we need another assert(migrate_multifd()) check? 
> > + } else if (channel == CH_POSTCOPY) { > > + assert(migrate_postcopy_preempt()); > > + assert(!mis->postcopy_qemufile_dst); > > + f = qemu_file_new_input(ioc); > > + postcopy_preempt_new_channel(mis, f); > > + return; > > } > > > > - if (migration_should_start_incoming(default_channel)) { > > + if (migration_has_main_and_multifd_channels()) { > > I think there's a bug here. Excluding multifd from the picture, if only > the main channel needs to be setup, then it's possible to start postcopy > recovery twice, once when the main channel appears and another time when > the preempt channel appears. * When the preempt channel appears 'channel' is set to CH_POSTCOPY, so it shall 'return' before reaching here, right? === } else if (!mis->from_src_file && mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) { /* reconnect main channel for postcopy recovery */ channel = CH_MAIN; } else { === * When 'main' channel connection arrives for postcopy recovery, 'channel' shall be set to CH_MAIN. > The previous code worked differently because it did: > > if (migrate_postcopy_preempt()) { > return main_channel; > > which would return false when preempt arrived after main. * Yes. > We could use migration_has_all_channels() instead, that would look more > logically correct, but it would also change the current behavior that > postcopy recovery can start before the preempt channel is in place. I'm > not even sure if that's actually part of the design of the feature. * Not sure if we need this. Thank you. --- - Prasad
© 2016 - 2025 Red Hat, Inc.