[PATCH 2/4] migration: Introduce VM_STARTED return-path message

Juraj Marcin posted 4 patches 1 week, 3 days ago
Maintainers: Eduardo Habkost <eduardo@habkost.net>, Marcel Apfelbaum <marcel.apfelbaum@gmail.com>, "Philippe Mathieu-Daudé" <philmd@linaro.org>, Yanan Wang <wangyanan55@huawei.com>, Zhao Liu <zhao1.liu@intel.com>, David Hildenbrand <david@kernel.org>, "Michael S. Tsirkin" <mst@redhat.com>, Peter Xu <peterx@redhat.com>, Fabiano Rosas <farosas@suse.de>, Jason Wang <jasowang@redhat.com>, Eric Blake <eblake@redhat.com>, Markus Armbruster <armbru@redhat.com>
[PATCH 2/4] migration: Introduce VM_STARTED return-path message
Posted by Juraj Marcin 1 week, 3 days ago
From: Juraj Marcin <jmarcin@redhat.com>

Currently there is no universal way for the destination to tell the
source it has started. In precopy it could be deduced from the RP_SHUT
message and in postcopy from the response to the ping just before the
POSTCOPY_RUN command, but neither method is precise. Moreover, there is
no way to send more data after the destination has started with precopy
migration.

This patch adds a new message type to the return-path which tells the
source that the destination VM has just started (or can be started if
autostart is false). The source VM can use this message to precisely
calculate the downtime regardless of whether postcopy is used, and can
also send more data, for example network packets.

Signed-off-by: Juraj Marcin <jmarcin@redhat.com>
---
 hw/core/machine.c     |  4 +++-
 migration/migration.c | 34 ++++++++++++++++++++++++++++++----
 migration/migration.h |  9 +++++++++
 migration/options.c   |  8 ++++++++
 migration/options.h   |  1 +
 migration/savevm.c    |  3 +++
 6 files changed, 54 insertions(+), 5 deletions(-)

diff --git a/hw/core/machine.c b/hw/core/machine.c
index 6411e68856..dc73217a5f 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -38,7 +38,9 @@
 #include "hw/acpi/generic_event_device.h"
 #include "qemu/audio.h"
 
-GlobalProperty hw_compat_10_2[] = {};
+GlobalProperty hw_compat_10_2[] = {
+    { "migration", "send-vm-started", "off" },
+};
 const size_t hw_compat_10_2_len = G_N_ELEMENTS(hw_compat_10_2);
 
 GlobalProperty hw_compat_10_1[] = {
diff --git a/migration/migration.c b/migration/migration.c
index b103a82fc0..4871db2365 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -82,6 +82,7 @@ enum mig_rp_message_type {
     MIG_RP_MSG_RECV_BITMAP,  /* send recved_bitmap back to source */
     MIG_RP_MSG_RESUME_ACK,   /* tell source that we are ready to resume */
     MIG_RP_MSG_SWITCHOVER_ACK, /* Tell source it's OK to do switchover */
+    MIG_RP_MSG_VM_STARTED,   /* tell source destination has started */
 
     MIG_RP_MSG_MAX
 };
@@ -750,6 +751,10 @@ static void process_incoming_migration_bh(void *opaque)
         runstate_set(global_state_get_runstate());
     }
     trace_vmstate_downtime_checkpoint("dst-precopy-bh-vm-started");
+    if (mis->to_src_file && migrate_send_vm_started()) {
+        migrate_send_rp_vm_started(mis);
+    }
+
     /*
      * This must happen after any state changes since as soon as an external
      * observer sees this event they might start to prod at the VM assuming
@@ -996,6 +1001,11 @@ void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value)
     migrate_send_rp_message(mis, MIG_RP_MSG_RESUME_ACK, sizeof(buf), &buf);
 }
 
+void migrate_send_rp_vm_started(MigrationIncomingState *mis)
+{
+    migrate_send_rp_message(mis, MIG_RP_MSG_VM_STARTED, 0, NULL);
+}
+
 bool migration_is_running(void)
 {
     MigrationState *s = current_migration;
@@ -1660,6 +1670,9 @@ int migrate_init(MigrationState *s, Error **errp)
     s->postcopy_package_loaded = false;
     qemu_event_reset(&s->postcopy_package_loaded_event);
 
+    s->dest_vm_started = false;
+    qemu_event_reset(&s->dest_vm_started_event);
+
     return 0;
 }
 
@@ -2368,6 +2381,12 @@ static void *source_return_path_thread(void *opaque)
             trace_source_return_path_thread_switchover_acked();
             break;
 
+        case MIG_RP_MSG_VM_STARTED:
+            migration_downtime_end(ms);
+            ms->dest_vm_started = true;
+            qemu_event_set(&ms->dest_vm_started_event);
+            break;
+
         default:
             break;
         }
@@ -2591,7 +2610,9 @@ static int postcopy_start(MigrationState *ms, Error **errp)
      */
     migration_call_notifiers(MIG_EVENT_PRECOPY_DONE, NULL);
 
-    migration_downtime_end(ms);
+    if (!ms->rp_state.rp_thread_created || !migrate_send_vm_started()) {
+        migration_downtime_end(ms);
+    }
 
     if (migrate_postcopy_ram()) {
         /*
@@ -3086,7 +3107,9 @@ static void migration_completion_end(MigrationState *s)
      * - correct ordering of s->mbps update vs. s->state;
      */
     bql_lock();
-    migration_downtime_end(s);
+    if (!s->rp_state.rp_thread_created || !migrate_send_vm_started()) {
+        migration_downtime_end(s);
+    }
     s->total_time = end_time - s->start_time;
     transfer_time = s->total_time - s->setup_time;
     if (transfer_time) {
@@ -3300,9 +3323,10 @@ static void migration_iteration_finish(MigrationState *s)
     case MIGRATION_STATUS_FAILED:
     case MIGRATION_STATUS_CANCELLED:
     case MIGRATION_STATUS_CANCELLING:
-        if (!migration_block_activate(&local_err)) {
+        if (s->dest_vm_started || !migration_block_activate(&local_err)) {
             /*
-            * Re-activate the block drives if they're inactivated.
+            * Re-activate the block drives if they're inactivated and the dest
+            * vm has not reported that it has started.
             *
             * If it fails (e.g. in case of a split brain, where dest QEMU
             * might have taken some of the drive locks and running!), do
@@ -3853,6 +3877,7 @@ static void migration_instance_finalize(Object *obj)
     qemu_sem_destroy(&ms->postcopy_qemufile_src_sem);
     error_free(ms->error);
     qemu_event_destroy(&ms->postcopy_package_loaded_event);
+    qemu_event_destroy(&ms->dest_vm_started_event);
 }
 
 static void migration_instance_init(Object *obj)
@@ -3875,6 +3900,7 @@ static void migration_instance_init(Object *obj)
     qemu_sem_init(&ms->postcopy_qemufile_src_sem, 0);
     qemu_mutex_init(&ms->qemu_file_lock);
     qemu_event_init(&ms->postcopy_package_loaded_event, 0);
+    qemu_event_init(&ms->dest_vm_started_event, false);
 }
 
 /*
diff --git a/migration/migration.h b/migration/migration.h
index b6888daced..a3fab4f27e 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -522,6 +522,14 @@ struct MigrationState {
      * anything as input.
      */
     bool has_block_bitmap_mapping;
+
+    /*
+     * Send VM_STARTED message on the return-path when dest VM finishes
+     * loading device state and switches out of INMIGRATE run state.
+     */
+    bool send_vm_started;
+    bool dest_vm_started;
+    QemuEvent dest_vm_started_event;
 };
 
 void migrate_set_state(MigrationStatus *state, MigrationStatus old_state,
@@ -564,6 +572,7 @@ void migrate_send_rp_recv_bitmap(MigrationIncomingState *mis,
                                  char *block_name);
 void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value);
 int migrate_send_rp_switchover_ack(MigrationIncomingState *mis);
+void migrate_send_rp_vm_started(MigrationIncomingState *mis);
 
 void dirty_bitmap_mig_before_vm_start(void);
 void dirty_bitmap_mig_cancel_outgoing(void);
diff --git a/migration/options.c b/migration/options.c
index 1ffe85a2d8..a5a233183b 100644
--- a/migration/options.c
+++ b/migration/options.c
@@ -108,6 +108,7 @@ const Property migration_properties[] = {
                      preempt_pre_7_2, false),
     DEFINE_PROP_BOOL("multifd-clean-tls-termination", MigrationState,
                      multifd_clean_tls_termination, true),
+    DEFINE_PROP_BOOL("send-vm-started", MigrationState, send_vm_started, true),
 
     /* Migration parameters */
     DEFINE_PROP_UINT8("x-throttle-trigger-threshold", MigrationState,
@@ -434,6 +435,13 @@ bool migrate_zero_copy_send(void)
     return s->capabilities[MIGRATION_CAPABILITY_ZERO_COPY_SEND];
 }
 
+bool migrate_send_vm_started(void)
+{
+    MigrationState *s = migrate_get_current();
+
+    return s->send_vm_started;
+}
+
 /* pseudo capabilities */
 
 bool migrate_multifd_flush_after_each_section(void)
diff --git a/migration/options.h b/migration/options.h
index b502871097..5fdc8fc6fe 100644
--- a/migration/options.h
+++ b/migration/options.h
@@ -42,6 +42,7 @@ bool migrate_return_path(void);
 bool migrate_validate_uuid(void);
 bool migrate_xbzrle(void);
 bool migrate_zero_copy_send(void);
+bool migrate_send_vm_started(void);
 
 /*
  * pseudo capabilities
diff --git a/migration/savevm.c b/migration/savevm.c
index 3dc812a7bb..1020094fc8 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -2157,6 +2157,9 @@ static void loadvm_postcopy_handle_run_bh(void *opaque)
     }
 
     trace_vmstate_downtime_checkpoint("dst-postcopy-bh-vm-started");
+    if (mis->to_src_file && migrate_send_vm_started()) {
+        migrate_send_rp_vm_started(mis);
+    }
 }
 
 /* After all discards we can start running and asking for pages */
-- 
2.52.0
Re: [PATCH 2/4] migration: Introduce VM_STARTED return-path message
Posted by Michael S. Tsirkin 1 week, 3 days ago
On Tue, Jan 27, 2026 at 03:03:08PM +0100, Juraj Marcin wrote:
> From: Juraj Marcin <jmarcin@redhat.com>
> 
> Currently there is no universal way for the destination to tell the
> source it has started. In precopy it could be deduced from the RP_SHUT
> message and in postcopy from the response to the ping just before the
> POSTCOPY_RUN command, but neither method is precise. Moreover, there is
> no way to send more data after the destination has started with precopy
> migration.
> 
> This patch adds new message type to the return-path which tells the
> source that the destination VM has just started (or can be started if
> autostart is false). Source VM can use this message to precisely
> calculate the downtime regardless of if postcopy is used and can also
> send more data, for example network packets.
> 
> Signed-off-by: Juraj Marcin <jmarcin@redhat.com>


I do not think it matters that the VM started, at least not for
the issue in question.

What matters is that a packet is transmitted on behalf of the VM,
on the specific interface.



> ---
>  hw/core/machine.c     |  4 +++-
>  migration/migration.c | 34 ++++++++++++++++++++++++++++++----
>  migration/migration.h |  9 +++++++++
>  migration/options.c   |  8 ++++++++
>  migration/options.h   |  1 +
>  migration/savevm.c    |  3 +++
>  6 files changed, 54 insertions(+), 5 deletions(-)
> 
> diff --git a/hw/core/machine.c b/hw/core/machine.c
> index 6411e68856..dc73217a5f 100644
> --- a/hw/core/machine.c
> +++ b/hw/core/machine.c
> @@ -38,7 +38,9 @@
>  #include "hw/acpi/generic_event_device.h"
>  #include "qemu/audio.h"
>  
> -GlobalProperty hw_compat_10_2[] = {};
> +GlobalProperty hw_compat_10_2[] = {
> +    { "migration", "send-vm-started", "off" },
> +};
>  const size_t hw_compat_10_2_len = G_N_ELEMENTS(hw_compat_10_2);
>  
>  GlobalProperty hw_compat_10_1[] = {
> diff --git a/migration/migration.c b/migration/migration.c
> index b103a82fc0..4871db2365 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -82,6 +82,7 @@ enum mig_rp_message_type {
>      MIG_RP_MSG_RECV_BITMAP,  /* send recved_bitmap back to source */
>      MIG_RP_MSG_RESUME_ACK,   /* tell source that we are ready to resume */
>      MIG_RP_MSG_SWITCHOVER_ACK, /* Tell source it's OK to do switchover */
> +    MIG_RP_MSG_VM_STARTED,   /* tell source destination has started */
>  
>      MIG_RP_MSG_MAX
>  };
> @@ -750,6 +751,10 @@ static void process_incoming_migration_bh(void *opaque)
>          runstate_set(global_state_get_runstate());
>      }
>      trace_vmstate_downtime_checkpoint("dst-precopy-bh-vm-started");
> +    if (mis->to_src_file && migrate_send_vm_started()) {
> +        migrate_send_rp_vm_started(mis);
> +    }
> +
>      /*
>       * This must happen after any state changes since as soon as an external
>       * observer sees this event they might start to prod at the VM assuming
> @@ -996,6 +1001,11 @@ void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value)
>      migrate_send_rp_message(mis, MIG_RP_MSG_RESUME_ACK, sizeof(buf), &buf);
>  }
>  
> +void migrate_send_rp_vm_started(MigrationIncomingState *mis)
> +{
> +    migrate_send_rp_message(mis, MIG_RP_MSG_VM_STARTED, 0, NULL);
> +}
> +
>  bool migration_is_running(void)
>  {
>      MigrationState *s = current_migration;
> @@ -1660,6 +1670,9 @@ int migrate_init(MigrationState *s, Error **errp)
>      s->postcopy_package_loaded = false;
>      qemu_event_reset(&s->postcopy_package_loaded_event);
>  
> +    s->dest_vm_started = false;
> +    qemu_event_reset(&s->dest_vm_started_event);
> +
>      return 0;
>  }
>  
> @@ -2368,6 +2381,12 @@ static void *source_return_path_thread(void *opaque)
>              trace_source_return_path_thread_switchover_acked();
>              break;
>  
> +        case MIG_RP_MSG_VM_STARTED:
> +            migration_downtime_end(ms);
> +            ms->dest_vm_started = true;
> +            qemu_event_set(&ms->dest_vm_started_event);
> +            break;
> +
>          default:
>              break;
>          }
> @@ -2591,7 +2610,9 @@ static int postcopy_start(MigrationState *ms, Error **errp)
>       */
>      migration_call_notifiers(MIG_EVENT_PRECOPY_DONE, NULL);
>  
> -    migration_downtime_end(ms);
> +    if (!ms->rp_state.rp_thread_created || !migrate_send_vm_started()) {
> +        migration_downtime_end(ms);
> +    }
>  
>      if (migrate_postcopy_ram()) {
>          /*
> @@ -3086,7 +3107,9 @@ static void migration_completion_end(MigrationState *s)
>       * - correct ordering of s->mbps update vs. s->state;
>       */
>      bql_lock();
> -    migration_downtime_end(s);
> +    if (!s->rp_state.rp_thread_created || !migrate_send_vm_started()) {
> +        migration_downtime_end(s);
> +    }
>      s->total_time = end_time - s->start_time;
>      transfer_time = s->total_time - s->setup_time;
>      if (transfer_time) {
> @@ -3300,9 +3323,10 @@ static void migration_iteration_finish(MigrationState *s)
>      case MIGRATION_STATUS_FAILED:
>      case MIGRATION_STATUS_CANCELLED:
>      case MIGRATION_STATUS_CANCELLING:
> -        if (!migration_block_activate(&local_err)) {
> +        if (s->dest_vm_started || !migration_block_activate(&local_err)) {
>              /*
> -            * Re-activate the block drives if they're inactivated.
> +            * Re-activate the block drives if they're inactivated and the dest
> +            * vm has not reported that it has started.
>              *
>              * If it fails (e.g. in case of a split brain, where dest QEMU
>              * might have taken some of the drive locks and running!), do
> @@ -3853,6 +3877,7 @@ static void migration_instance_finalize(Object *obj)
>      qemu_sem_destroy(&ms->postcopy_qemufile_src_sem);
>      error_free(ms->error);
>      qemu_event_destroy(&ms->postcopy_package_loaded_event);
> +    qemu_event_destroy(&ms->dest_vm_started_event);
>  }
>  
>  static void migration_instance_init(Object *obj)
> @@ -3875,6 +3900,7 @@ static void migration_instance_init(Object *obj)
>      qemu_sem_init(&ms->postcopy_qemufile_src_sem, 0);
>      qemu_mutex_init(&ms->qemu_file_lock);
>      qemu_event_init(&ms->postcopy_package_loaded_event, 0);
> +    qemu_event_init(&ms->dest_vm_started_event, false);
>  }
>  
>  /*
> diff --git a/migration/migration.h b/migration/migration.h
> index b6888daced..a3fab4f27e 100644
> --- a/migration/migration.h
> +++ b/migration/migration.h
> @@ -522,6 +522,14 @@ struct MigrationState {
>       * anything as input.
>       */
>      bool has_block_bitmap_mapping;
> +
> +    /*
> +     * Do send VM_START message on the return-path when dest VM finishes
> +     * loading device state and switches out of INMIGRATE run state.
> +     */
> +    bool send_vm_started;
> +    bool dest_vm_started;
> +    QemuEvent dest_vm_started_event;
>  };
>  
>  void migrate_set_state(MigrationStatus *state, MigrationStatus old_state,
> @@ -564,6 +572,7 @@ void migrate_send_rp_recv_bitmap(MigrationIncomingState *mis,
>                                   char *block_name);
>  void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value);
>  int migrate_send_rp_switchover_ack(MigrationIncomingState *mis);
> +void migrate_send_rp_vm_started(MigrationIncomingState *mis);
>  
>  void dirty_bitmap_mig_before_vm_start(void);
>  void dirty_bitmap_mig_cancel_outgoing(void);
> diff --git a/migration/options.c b/migration/options.c
> index 1ffe85a2d8..a5a233183b 100644
> --- a/migration/options.c
> +++ b/migration/options.c
> @@ -108,6 +108,7 @@ const Property migration_properties[] = {
>                       preempt_pre_7_2, false),
>      DEFINE_PROP_BOOL("multifd-clean-tls-termination", MigrationState,
>                       multifd_clean_tls_termination, true),
> +    DEFINE_PROP_BOOL("send-vm-started", MigrationState, send_vm_started, true),
>  
>      /* Migration parameters */
>      DEFINE_PROP_UINT8("x-throttle-trigger-threshold", MigrationState,
> @@ -434,6 +435,13 @@ bool migrate_zero_copy_send(void)
>      return s->capabilities[MIGRATION_CAPABILITY_ZERO_COPY_SEND];
>  }
>  
> +bool migrate_send_vm_started(void)
> +{
> +    MigrationState *s = migrate_get_current();
> +
> +    return s->send_vm_started;
> +}
> +
>  /* pseudo capabilities */
>  
>  bool migrate_multifd_flush_after_each_section(void)
> diff --git a/migration/options.h b/migration/options.h
> index b502871097..5fdc8fc6fe 100644
> --- a/migration/options.h
> +++ b/migration/options.h
> @@ -42,6 +42,7 @@ bool migrate_return_path(void);
>  bool migrate_validate_uuid(void);
>  bool migrate_xbzrle(void);
>  bool migrate_zero_copy_send(void);
> +bool migrate_send_vm_started(void);
>  
>  /*
>   * pseudo capabilities
> diff --git a/migration/savevm.c b/migration/savevm.c
> index 3dc812a7bb..1020094fc8 100644
> --- a/migration/savevm.c
> +++ b/migration/savevm.c
> @@ -2157,6 +2157,9 @@ static void loadvm_postcopy_handle_run_bh(void *opaque)
>      }
>  
>      trace_vmstate_downtime_checkpoint("dst-postcopy-bh-vm-started");
> +    if (mis->to_src_file && migrate_send_vm_started()) {
> +        migrate_send_rp_vm_started(mis);
> +    }
>  }
>  
>  /* After all discards we can start running and asking for pages */
> -- 
> 2.52.0