From: Juraj Marcin <jmarcin@redhat.com>
Currently, when postcopy starts, the source VM starts switchover and
sends a package containing the state of all non-postcopiable devices.
When the destination loads this package, the switchover is complete and
the destination VM starts. However, if the device state load fails or
the destination side crashes, the source side is already in
POSTCOPY_ACTIVE state and cannot be recovered, even though it has the
most up-to-date machine state, as the destination has not yet started.
This patch introduces a new POSTCOPY_DEVICE state which is active
while the destination machine is loading the device state and is not
yet running; in this state, the source side can still be resumed in
case of a migration failure.
To transition from POSTCOPY_DEVICE to POSTCOPY_ACTIVE, the source
side uses a PONG message that is a response to a PING message processed
just before the POSTCOPY_RUN command that starts the destination VM.
Thus, this change does not require any changes on the destination side
and is effective even with older destination versions.
Signed-off-by: Juraj Marcin <jmarcin@redhat.com>
---
migration/migration.c | 23 ++++++++++++++++++-----
migration/savevm.h | 2 ++
migration/trace-events | 1 +
qapi/migration.json | 8 ++++++--
tests/qtest/migration/precopy-tests.c | 3 ++-
5 files changed, 29 insertions(+), 8 deletions(-)
diff --git a/migration/migration.c b/migration/migration.c
index 7222e3de13..e63a7487be 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1223,6 +1223,7 @@ bool migration_is_running(void)
switch (s->state) {
case MIGRATION_STATUS_ACTIVE:
+ case MIGRATION_STATUS_POSTCOPY_DEVICE:
case MIGRATION_STATUS_POSTCOPY_ACTIVE:
case MIGRATION_STATUS_POSTCOPY_PAUSED:
case MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP:
@@ -1244,6 +1245,7 @@ static bool migration_is_active(void)
MigrationState *s = current_migration;
return (s->state == MIGRATION_STATUS_ACTIVE ||
+ s->state == MIGRATION_STATUS_POSTCOPY_DEVICE ||
s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
}
@@ -1366,6 +1368,7 @@ static void fill_source_migration_info(MigrationInfo *info)
break;
case MIGRATION_STATUS_ACTIVE:
case MIGRATION_STATUS_CANCELLING:
+ case MIGRATION_STATUS_POSTCOPY_DEVICE:
case MIGRATION_STATUS_POSTCOPY_ACTIVE:
case MIGRATION_STATUS_PRE_SWITCHOVER:
case MIGRATION_STATUS_DEVICE:
@@ -1419,6 +1422,7 @@ static void fill_destination_migration_info(MigrationInfo *info)
case MIGRATION_STATUS_CANCELLING:
case MIGRATION_STATUS_CANCELLED:
case MIGRATION_STATUS_ACTIVE:
+ case MIGRATION_STATUS_POSTCOPY_DEVICE:
case MIGRATION_STATUS_POSTCOPY_ACTIVE:
case MIGRATION_STATUS_POSTCOPY_PAUSED:
case MIGRATION_STATUS_POSTCOPY_RECOVER:
@@ -1719,6 +1723,7 @@ bool migration_in_postcopy(void)
MigrationState *s = migrate_get_current();
switch (s->state) {
+ case MIGRATION_STATUS_POSTCOPY_DEVICE:
case MIGRATION_STATUS_POSTCOPY_ACTIVE:
case MIGRATION_STATUS_POSTCOPY_PAUSED:
case MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP:
@@ -2564,6 +2569,11 @@ static void *source_return_path_thread(void *opaque)
tmp32 = ldl_be_p(buf);
trace_source_return_path_thread_pong(tmp32);
qemu_sem_post(&ms->rp_state.rp_pong_acks);
+ if (tmp32 == QEMU_VM_PING_PACKAGED_LOADED) {
+ trace_source_return_path_thread_dst_started();
+ migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_DEVICE,
+ MIGRATION_STATUS_POSTCOPY_ACTIVE);
+ }
break;
case MIG_RP_MSG_REQ_PAGES:
@@ -2814,6 +2824,7 @@ static int postcopy_start(MigrationState *ms, Error **errp)
if (migrate_postcopy_ram()) {
qemu_savevm_send_ping(fb, 3);
}
+ qemu_savevm_send_ping(fb, QEMU_VM_PING_PACKAGED_LOADED);
qemu_savevm_send_postcopy_run(fb);
@@ -2871,7 +2882,7 @@ static int postcopy_start(MigrationState *ms, Error **errp)
/* Now, switchover looks all fine, switching to postcopy-active */
migrate_set_state(&ms->state, MIGRATION_STATUS_DEVICE,
- MIGRATION_STATUS_POSTCOPY_ACTIVE);
+ MIGRATION_STATUS_POSTCOPY_DEVICE);
bql_unlock();
@@ -3035,7 +3046,8 @@ static void migration_completion(MigrationState *s)
if (s->state == MIGRATION_STATUS_ACTIVE) {
ret = migration_completion_precopy(s);
- } else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
+ } else if (s->state == MIGRATION_STATUS_POSTCOPY_DEVICE ||
+ s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
migration_completion_postcopy(s);
} else {
ret = -1;
@@ -3311,8 +3323,8 @@ static MigThrError migration_detect_error(MigrationState *s)
return postcopy_pause(s);
} else {
/*
- * For precopy (or postcopy with error outside IO), we fail
- * with no time.
+ * For precopy (or postcopy with error outside IO, or before dest
+ * starts), we fail with no time.
*/
migrate_set_state(&s->state, state, MIGRATION_STATUS_FAILED);
trace_migration_thread_file_err();
@@ -3447,7 +3459,8 @@ static MigIterateState migration_iteration_run(MigrationState *s)
{
uint64_t must_precopy, can_postcopy, pending_size;
Error *local_err = NULL;
- bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE;
+ bool in_postcopy = (s->state == MIGRATION_STATUS_POSTCOPY_DEVICE ||
+ s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
bool can_switchover = migration_can_switchover(s);
bool complete_ready;
diff --git a/migration/savevm.h b/migration/savevm.h
index 2d5e9c7166..c4de0325eb 100644
--- a/migration/savevm.h
+++ b/migration/savevm.h
@@ -29,6 +29,8 @@
#define QEMU_VM_COMMAND 0x08
#define QEMU_VM_SECTION_FOOTER 0x7e
+#define QEMU_VM_PING_PACKAGED_LOADED 0x42
+
bool qemu_savevm_state_blocked(Error **errp);
void qemu_savevm_non_migratable_list(strList **reasons);
int qemu_savevm_state_prepare(Error **errp);
diff --git a/migration/trace-events b/migration/trace-events
index 706db97def..007b5c407e 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -191,6 +191,7 @@ source_return_path_thread_pong(uint32_t val) "0x%x"
source_return_path_thread_shut(uint32_t val) "0x%x"
source_return_path_thread_resume_ack(uint32_t v) "%"PRIu32
source_return_path_thread_switchover_acked(void) ""
+source_return_path_thread_dst_started(void) ""
migration_thread_low_pending(uint64_t pending) "%" PRIu64
migrate_transferred(uint64_t transferred, uint64_t time_spent, uint64_t bandwidth, uint64_t avail_bw, uint64_t size) "transferred %" PRIu64 " time_spent %" PRIu64 " bandwidth %" PRIu64 " switchover_bw %" PRIu64 " max_size %" PRId64
process_incoming_migration_co_end(int ret, int ps) "ret=%d postcopy-state=%d"
diff --git a/qapi/migration.json b/qapi/migration.json
index 2387c21e9c..89a20d858d 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -142,6 +142,10 @@
# @postcopy-active: like active, but now in postcopy mode.
# (since 2.5)
#
+# @postcopy-device: like postcopy-active, but the destination is still
+# loading device state and is not running yet. If migration fails
+# during this state, the source side will resume. (since 10.2)
+#
# @postcopy-paused: during postcopy but paused. (since 3.0)
#
# @postcopy-recover-setup: setup phase for a postcopy recovery
@@ -173,8 +177,8 @@
##
{ 'enum': 'MigrationStatus',
'data': [ 'none', 'setup', 'cancelling', 'cancelled',
- 'active', 'postcopy-active', 'postcopy-paused',
- 'postcopy-recover-setup',
+ 'active', 'postcopy-device', 'postcopy-active',
+ 'postcopy-paused', 'postcopy-recover-setup',
'postcopy-recover', 'completed', 'failed', 'colo',
'pre-switchover', 'device', 'wait-unplug' ] }
##
diff --git a/tests/qtest/migration/precopy-tests.c b/tests/qtest/migration/precopy-tests.c
index bb38292550..57ca623de5 100644
--- a/tests/qtest/migration/precopy-tests.c
+++ b/tests/qtest/migration/precopy-tests.c
@@ -1316,13 +1316,14 @@ void migration_test_add_precopy(MigrationTestEnv *env)
}
/* ensure new status don't go unnoticed */
- assert(MIGRATION_STATUS__MAX == 15);
+ assert(MIGRATION_STATUS__MAX == 16);
for (int i = MIGRATION_STATUS_NONE; i < MIGRATION_STATUS__MAX; i++) {
switch (i) {
case MIGRATION_STATUS_DEVICE: /* happens too fast */
case MIGRATION_STATUS_WAIT_UNPLUG: /* no support in tests */
case MIGRATION_STATUS_COLO: /* no support in tests */
+ case MIGRATION_STATUS_POSTCOPY_DEVICE: /* postcopy can't be cancelled */
case MIGRATION_STATUS_POSTCOPY_ACTIVE: /* postcopy can't be cancelled */
case MIGRATION_STATUS_POSTCOPY_PAUSED:
case MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP:
--
2.51.0
On Mon, Sep 15, 2025 at 13:59:15 +0200, Juraj Marcin wrote:
> From: Juraj Marcin <jmarcin@redhat.com>
>
> Currently, when postcopy starts, the source VM starts switchover and
> sends a package containing the state of all non-postcopiable devices.
> When the destination loads this package, the switchover is complete and
> the destination VM starts. However, if the device state load fails or
> the destination side crashes, the source side is already in
> POSTCOPY_ACTIVE state and cannot be recovered, even when it has the most
> up-to-date machine state as the destination has not yet started.
>
> This patch introduces a new POSTCOPY_DEVICE state which is active
> while the destination machine is loading the device state, is not yet
> running, and the source side can be resumed in case of a migration
> failure.
>
> To transition from POSTCOPY_DEVICE to POSTCOPY_ACTIVE, the source
> side uses a PONG message that is a response to a PING message processed
> just before the POSTCOPY_RUN command that starts the destination VM.
> Thus, this change does not require any changes on the destination side
> and is effective even with older destination versions.

Thanks, this will help libvirt as we think that the migration can be
safely aborted unless we successfully called "cont" and thus we just
kill QEMU on the destination. But since QEMU on the source already
entered postcopy-active, we can't cancel the migration and the result is
a paused VM with no way of recovering it.

This series will make the situation better as the source will stay in
postcopy-device until the destination successfully loads device data.
There's still room for some enhancement though. Depending on how fast
this loading is, libvirt may issue cont before device data is loaded
(the destination is already in postcopy-active at this point), which
always succeeds as it only marks the domain to be autostarted, but the
actual start may fail later. When discussing this with Juraj we agreed
on introducing the new postcopy-device state on the destination as well
to make sure libvirt will only call cont once device data was
successfully loaded so that we always get a proper result when running
cont. But it may still fail when locking disks fails (not sure if this
is the only way cont may fail). In this case we cannot cancel the
migration on the source as it is already in postcopy-active and we can't
recover migration either as the CPUs are not running on the destination.
Ideally we'd have a way of canceling the migration in postcopy-active if
we are sure CPUs were not started yet. Alternatively a possibility to
recover migration would work as well.

Jirka
On Thu, Sep 25, 2025 at 01:54:40PM +0200, Jiří Denemark wrote:
> On Mon, Sep 15, 2025 at 13:59:15 +0200, Juraj Marcin wrote:
> > From: Juraj Marcin <jmarcin@redhat.com>
> >
> > Currently, when postcopy starts, the source VM starts switchover and
> > sends a package containing the state of all non-postcopiable devices.
> > When the destination loads this package, the switchover is complete and
> > the destination VM starts. However, if the device state load fails or
> > the destination side crashes, the source side is already in
> > POSTCOPY_ACTIVE state and cannot be recovered, even when it has the most
> > up-to-date machine state as the destination has not yet started.
> >
> > This patch introduces a new POSTCOPY_DEVICE state which is active
> > while the destination machine is loading the device state, is not yet
> > running, and the source side can be resumed in case of a migration
> > failure.
> >
> > To transition from POSTCOPY_DEVICE to POSTCOPY_ACTIVE, the source
> > side uses a PONG message that is a response to a PING message processed
> > just before the POSTCOPY_RUN command that starts the destination VM.
> > Thus, this change does not require any changes on the destination side
> > and is effective even with older destination versions.
>
> Thanks, this will help libvirt as we think that the migration can be
> safely aborted unless we successfully called "cont" and thus we just
> kill QEMU on the destination. But since QEMU on the source already
> entered postcopy-active, we can't cancel the migration and the result is
> a paused VM with no way of recovering it.
>
> This series will make the situation better as the source will stay in
> postcopy-device until the destination successfully loads device data.
> There's still room for some enhancement though. Depending on how fast
> this loading is libvirt may issue cont before device data is loaded (the
> destination is already in postcopy-active at this point), which always
> succeeds as it only marks the domain to be autostarted, but the actual
> start may fail later. When discussing this with Juraj we agreed on
> introducing the new postcopy-device state on the destination as well to

I used to think and define postcopy-active be the state we should never
be able to cancel it anymore, implying that the real postcopy process is
in progress, and also implying the state where we need to start assume
the latest VM pages are spread on both sides, not one anymore.
Cancellation or killing either side means crashing VM then.

It could be a good thing indeed to have postcopy-device on dest too from
that regard, because having postcopy-device on dest can mark out the
small initial window when dest qemu hasn't yet start to generate new
data but only applying old data (device data, which src also owns a
copy).

From that POV, that indeed does not belong to the point if we define
postcopy-active as above. IOW, also with such definition, setting
postcopy-active on dest QEMU right at the entry of ram load thread (what
we do right now..) is too early.

> make sure libvirt will only call cont once device data was successfully
> loaded so that we always get a proper result when running cont. But it

Do we know an estimate of how much extra downtime this would introduce?

We should have discussed this in a previous thread, the issue is if we
cont only after device loaded, then dest QEMU may need to wait a while
until it receives the cont from libvirt, that will contribute to the
downtime. It would best be avoided, or if it's negligible then it's fine
too but I'm not sure whether it's guaranteed to be negligible..

If the goal is to make sure libvirt knows what is happening, can it
still relies on the event emitted, in this case, RESUME?

We can also reorg how postcopy-device and postcopy-active states will be
reported on dest, then they'll help if RESUME is too coarse grained.

So far, dest QEMU will try to resume the VM after getting RUN command,
that is what loadvm_postcopy_handle_run_bh() does, and it will (when
autostart=1 set):

  (1) firstly try to activate all block devices, iff it succeeded,
  (2) do vm_start(), at the end of which RESUME event will be generated.

So RESUME currently implies both disk activation success, and vm start
worked.

> may still fail when locking disks fails (not sure if this is the only
> way cont may fail). In this case we cannot cancel the migration on the

Is there any known issue with locking disks that dest would fail?  This
really sound like we should have the admin taking a look.

I recall Juraj mentioned off list that drive lock sometimes will stop
working.  Is that relevant here?

> source as it is already in postcopy-active and we can't recover
> migration either as the CPUs are not running on the destination. Ideally
> we'd have a way of canceling the migration in postocpy-active if we are
> sure CPUs were not started yet. Alternatively a possibility to recover
> migration would work as well.

Thanks,

-- 
Peter Xu
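For reference, the destination-side handling of the RUN command described
above boils down to roughly the following (a simplified paraphrase, not the
literal body of loadvm_postcopy_handle_run_bh(), which also deals with
dirty bitmaps and other housekeeping; helper names may differ from the
current code):

    static void postcopy_run_sketch(void)
    {
        Error *local_err = NULL;

        /* (1) Activate block devices; this is where disk locking can fail. */
        if (bdrv_activate_all(&local_err) < 0) {
            error_report_err(local_err);
            return;  /* VM stays stopped, no RESUME event is emitted */
        }

        if (autostart) {
            /* (2) Start the vCPUs; vm_start() emits the RESUME event. */
            vm_start();
        } else {
            /* Management (e.g. libvirt "cont") starts the vCPUs later. */
            runstate_set(RUN_STATE_PAUSED);
        }
    }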
On Mon, Sep 15, 2025 at 01:59:15PM +0200, Juraj Marcin wrote: > From: Juraj Marcin <jmarcin@redhat.com> > > Currently, when postcopy starts, the source VM starts switchover and > sends a package containing the state of all non-postcopiable devices. > When the destination loads this package, the switchover is complete and > the destination VM starts. However, if the device state load fails or > the destination side crashes, the source side is already in > POSTCOPY_ACTIVE state and cannot be recovered, even when it has the most > up-to-date machine state as the destination has not yet started. > > This patch introduces a new POSTCOPY_DEVICE state which is active > while the destination machine is loading the device state, is not yet > running, and the source side can be resumed in case of a migration > failure. > > To transition from POSTCOPY_DEVICE to POSTCOPY_ACTIVE, the source > side uses a PONG message that is a response to a PING message processed > just before the POSTCOPY_RUN command that starts the destination VM. > Thus, this change does not require any changes on the destination side > and is effective even with older destination versions. > > Signed-off-by: Juraj Marcin <jmarcin@redhat.com> > --- > migration/migration.c | 23 ++++++++++++++++++----- > migration/savevm.h | 2 ++ > migration/trace-events | 1 + > qapi/migration.json | 8 ++++++-- > tests/qtest/migration/precopy-tests.c | 3 ++- > 5 files changed, 29 insertions(+), 8 deletions(-) > > diff --git a/migration/migration.c b/migration/migration.c > index 7222e3de13..e63a7487be 100644 > --- a/migration/migration.c > +++ b/migration/migration.c > @@ -1223,6 +1223,7 @@ bool migration_is_running(void) > > switch (s->state) { > case MIGRATION_STATUS_ACTIVE: > + case MIGRATION_STATUS_POSTCOPY_DEVICE: > case MIGRATION_STATUS_POSTCOPY_ACTIVE: > case MIGRATION_STATUS_POSTCOPY_PAUSED: > case MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP: > @@ -1244,6 +1245,7 @@ static bool migration_is_active(void) > MigrationState *s = current_migration; > > return (s->state == MIGRATION_STATUS_ACTIVE || > + s->state == MIGRATION_STATUS_POSTCOPY_DEVICE || > s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE); > } > > @@ -1366,6 +1368,7 @@ static void fill_source_migration_info(MigrationInfo *info) > break; > case MIGRATION_STATUS_ACTIVE: > case MIGRATION_STATUS_CANCELLING: > + case MIGRATION_STATUS_POSTCOPY_DEVICE: > case MIGRATION_STATUS_POSTCOPY_ACTIVE: > case MIGRATION_STATUS_PRE_SWITCHOVER: > case MIGRATION_STATUS_DEVICE: > @@ -1419,6 +1422,7 @@ static void fill_destination_migration_info(MigrationInfo *info) > case MIGRATION_STATUS_CANCELLING: > case MIGRATION_STATUS_CANCELLED: > case MIGRATION_STATUS_ACTIVE: > + case MIGRATION_STATUS_POSTCOPY_DEVICE: > case MIGRATION_STATUS_POSTCOPY_ACTIVE: > case MIGRATION_STATUS_POSTCOPY_PAUSED: > case MIGRATION_STATUS_POSTCOPY_RECOVER: > @@ -1719,6 +1723,7 @@ bool migration_in_postcopy(void) > MigrationState *s = migrate_get_current(); > > switch (s->state) { > + case MIGRATION_STATUS_POSTCOPY_DEVICE: > case MIGRATION_STATUS_POSTCOPY_ACTIVE: > case MIGRATION_STATUS_POSTCOPY_PAUSED: > case MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP: > @@ -2564,6 +2569,11 @@ static void *source_return_path_thread(void *opaque) > tmp32 = ldl_be_p(buf); > trace_source_return_path_thread_pong(tmp32); > qemu_sem_post(&ms->rp_state.rp_pong_acks); > + if (tmp32 == QEMU_VM_PING_PACKAGED_LOADED) { > + trace_source_return_path_thread_dst_started(); > + migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_DEVICE, > + 
MIGRATION_STATUS_POSTCOPY_ACTIVE); Could this race with the migration thread modifying the state concurrently? To avoid it, we could have a bool, set it here once, and in the iterations do something like: diff --git a/migration/migration.c b/migration/migration.c index 10c216d25d..55230e10ee 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -3449,6 +3449,16 @@ static MigIterateState migration_iteration_run(MigrationState *s) trace_migrate_pending_estimate(pending_size, must_precopy, can_postcopy); if (in_postcopy) { + if (s->postcopy_package_loaded) { + assert(s->state == MIGRATION_STATUS_POSTCOPY_DEVICE); + migrate_set_state(s->state, MIGRATION_STATUS_POSTCOPY_DEVICE, + MIGRATION_STATUS_POSTCOPY_ACTIVE); + /* + * Since postcopy cannot be re-initiated, this flag will only + * be set at most once for QEMU's whole lifecyce. + */ + s->postcopy_package_loaded = false; + } /* * Iterate in postcopy until all pending data flushed. Note that * postcopy completion doesn't rely on can_switchover, because when > + } > break; > > case MIG_RP_MSG_REQ_PAGES: > @@ -2814,6 +2824,7 @@ static int postcopy_start(MigrationState *ms, Error **errp) > if (migrate_postcopy_ram()) { > qemu_savevm_send_ping(fb, 3); > } > + qemu_savevm_send_ping(fb, QEMU_VM_PING_PACKAGED_LOADED); Nitpick: some comment would be nice here describing this ping, especially when it becomes functional. The name does provide some info, but not extremely clear what PACKAGED_LOADED mean if in a PONG yet. > > qemu_savevm_send_postcopy_run(fb); > > @@ -2871,7 +2882,7 @@ static int postcopy_start(MigrationState *ms, Error **errp) > > /* Now, switchover looks all fine, switching to postcopy-active */ > migrate_set_state(&ms->state, MIGRATION_STATUS_DEVICE, > - MIGRATION_STATUS_POSTCOPY_ACTIVE); > + MIGRATION_STATUS_POSTCOPY_DEVICE); > > bql_unlock(); > > @@ -3035,7 +3046,8 @@ static void migration_completion(MigrationState *s) > > if (s->state == MIGRATION_STATUS_ACTIVE) { > ret = migration_completion_precopy(s); > - } else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) { > + } else if (s->state == MIGRATION_STATUS_POSTCOPY_DEVICE || > + s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) { Exactly. We need to be prepared that src sending too fast so when device loading on dest we finished. > migration_completion_postcopy(s); > } else { > ret = -1; > @@ -3311,8 +3323,8 @@ static MigThrError migration_detect_error(MigrationState *s) > return postcopy_pause(s); > } else { > /* > - * For precopy (or postcopy with error outside IO), we fail > - * with no time. > + * For precopy (or postcopy with error outside IO, or before dest > + * starts), we fail with no time. > */ > migrate_set_state(&s->state, state, MIGRATION_STATUS_FAILED); > trace_migration_thread_file_err(); > @@ -3447,7 +3459,8 @@ static MigIterateState migration_iteration_run(MigrationState *s) > { > uint64_t must_precopy, can_postcopy, pending_size; > Error *local_err = NULL; > - bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE; > + bool in_postcopy = (s->state == MIGRATION_STATUS_POSTCOPY_DEVICE || > + s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE); > bool can_switchover = migration_can_switchover(s); > bool complete_ready; > > diff --git a/migration/savevm.h b/migration/savevm.h > index 2d5e9c7166..c4de0325eb 100644 > --- a/migration/savevm.h > +++ b/migration/savevm.h > @@ -29,6 +29,8 @@ > #define QEMU_VM_COMMAND 0x08 > #define QEMU_VM_SECTION_FOOTER 0x7e > > +#define QEMU_VM_PING_PACKAGED_LOADED 0x42 Only curious how you picked it. 
:) It's fine, it can also be 5, as we know exactly what we used to use (1-4). > + > bool qemu_savevm_state_blocked(Error **errp); > void qemu_savevm_non_migratable_list(strList **reasons); > int qemu_savevm_state_prepare(Error **errp); > diff --git a/migration/trace-events b/migration/trace-events > index 706db97def..007b5c407e 100644 > --- a/migration/trace-events > +++ b/migration/trace-events > @@ -191,6 +191,7 @@ source_return_path_thread_pong(uint32_t val) "0x%x" > source_return_path_thread_shut(uint32_t val) "0x%x" > source_return_path_thread_resume_ack(uint32_t v) "%"PRIu32 > source_return_path_thread_switchover_acked(void) "" > +source_return_path_thread_dst_started(void) "" > migration_thread_low_pending(uint64_t pending) "%" PRIu64 > migrate_transferred(uint64_t transferred, uint64_t time_spent, uint64_t bandwidth, uint64_t avail_bw, uint64_t size) "transferred %" PRIu64 " time_spent %" PRIu64 " bandwidth %" PRIu64 " switchover_bw %" PRIu64 " max_size %" PRId64 > process_incoming_migration_co_end(int ret, int ps) "ret=%d postcopy-state=%d" > diff --git a/qapi/migration.json b/qapi/migration.json > index 2387c21e9c..89a20d858d 100644 > --- a/qapi/migration.json > +++ b/qapi/migration.json > @@ -142,6 +142,10 @@ > # @postcopy-active: like active, but now in postcopy mode. > # (since 2.5) > # > +# @postcopy-device: like postcopy-active, but the destination is still > +# loading device state and is not running yet. If migration fails > +# during this state, the source side will resume. (since 10.2) Is it worthwhile to mention we could jump from postcopy-device to completed? I also wonder if it happens if libvirt would get confused. Worth checking with Jiri. Thanks, > +# > # @postcopy-paused: during postcopy but paused. (since 3.0) > # > # @postcopy-recover-setup: setup phase for a postcopy recovery > @@ -173,8 +177,8 @@ > ## > { 'enum': 'MigrationStatus', > 'data': [ 'none', 'setup', 'cancelling', 'cancelled', > - 'active', 'postcopy-active', 'postcopy-paused', > - 'postcopy-recover-setup', > + 'active', 'postcopy-device', 'postcopy-active', > + 'postcopy-paused', 'postcopy-recover-setup', > 'postcopy-recover', 'completed', 'failed', 'colo', > 'pre-switchover', 'device', 'wait-unplug' ] } > ## > diff --git a/tests/qtest/migration/precopy-tests.c b/tests/qtest/migration/precopy-tests.c > index bb38292550..57ca623de5 100644 > --- a/tests/qtest/migration/precopy-tests.c > +++ b/tests/qtest/migration/precopy-tests.c > @@ -1316,13 +1316,14 @@ void migration_test_add_precopy(MigrationTestEnv *env) > } > > /* ensure new status don't go unnoticed */ > - assert(MIGRATION_STATUS__MAX == 15); > + assert(MIGRATION_STATUS__MAX == 16); > > for (int i = MIGRATION_STATUS_NONE; i < MIGRATION_STATUS__MAX; i++) { > switch (i) { > case MIGRATION_STATUS_DEVICE: /* happens too fast */ > case MIGRATION_STATUS_WAIT_UNPLUG: /* no support in tests */ > case MIGRATION_STATUS_COLO: /* no support in tests */ > + case MIGRATION_STATUS_POSTCOPY_DEVICE: /* postcopy can't be cancelled */ > case MIGRATION_STATUS_POSTCOPY_ACTIVE: /* postcopy can't be cancelled */ > case MIGRATION_STATUS_POSTCOPY_PAUSED: > case MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP: > -- > 2.51.0 > -- Peter Xu
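For the comment requested above, something along these lines could work
(wording is only a suggestion, not part of the posted patch):

    if (migrate_postcopy_ram()) {
        qemu_savevm_send_ping(fb, 3);
    }

    /*
     * This PING is part of the packaged stream, so the destination only
     * processes it (and replies with a PONG) after the device state in
     * the package has been loaded, right before the POSTCOPY_RUN command
     * below.  The source uses that PONG to move from POSTCOPY_DEVICE to
     * POSTCOPY_ACTIVE.
     */
    qemu_savevm_send_ping(fb, QEMU_VM_PING_PACKAGED_LOADED);

    qemu_savevm_send_postcopy_run(fb);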
On Fri, Sep 19, 2025 at 12:58:08PM -0400, Peter Xu wrote:
> > @@ -2564,6 +2569,11 @@ static void *source_return_path_thread(void *opaque)
> >          tmp32 = ldl_be_p(buf);
> >          trace_source_return_path_thread_pong(tmp32);
> >          qemu_sem_post(&ms->rp_state.rp_pong_acks);
> > +        if (tmp32 == QEMU_VM_PING_PACKAGED_LOADED) {
> > +            trace_source_return_path_thread_dst_started();
> > +            migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_DEVICE,
> > +                              MIGRATION_STATUS_POSTCOPY_ACTIVE);
> 
> Could this race with the migration thread modifying the state concurrently?
> 
> To avoid it, we could have a bool, set it here once, and in the iterations
> do something like:
> 
> diff --git a/migration/migration.c b/migration/migration.c
> index 10c216d25d..55230e10ee 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -3449,6 +3449,16 @@ static MigIterateState migration_iteration_run(MigrationState *s)
>      trace_migrate_pending_estimate(pending_size, must_precopy, can_postcopy);
>  
>      if (in_postcopy) {
> +        if (s->postcopy_package_loaded) {
> +            assert(s->state == MIGRATION_STATUS_POSTCOPY_DEVICE);
> +            migrate_set_state(s->state, MIGRATION_STATUS_POSTCOPY_DEVICE,
> +                              MIGRATION_STATUS_POSTCOPY_ACTIVE);
> +            /*
> +             * Since postcopy cannot be re-initiated, this flag will only
> +             * be set at most once for QEMU's whole lifecyce.
> +             */
> +            s->postcopy_package_loaded = false;
> +        }
>          /*
>           * Iterate in postcopy until all pending data flushed.  Note that
>           * postcopy completion doesn't rely on can_switchover, because when

[...]

> > @@ -2871,7 +2882,7 @@ static int postcopy_start(MigrationState *ms, Error **errp)
> > 
> >      /* Now, switchover looks all fine, switching to postcopy-active */
> >      migrate_set_state(&ms->state, MIGRATION_STATUS_DEVICE,
> > -                      MIGRATION_STATUS_POSTCOPY_ACTIVE);
> > +                      MIGRATION_STATUS_POSTCOPY_DEVICE);
> > 
> >      bql_unlock();
> > 
> > @@ -3035,7 +3046,8 @@ static void migration_completion(MigrationState *s)
> > 
> >      if (s->state == MIGRATION_STATUS_ACTIVE) {
> >          ret = migration_completion_precopy(s);
> > -    } else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
> > +    } else if (s->state == MIGRATION_STATUS_POSTCOPY_DEVICE ||
> > +               s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
> 
> Exactly.  We need to be prepared that src sending too fast so when device
> loading on dest we finished.

One thing more to mention here.. which may void some previous comments I
left.  Let's discuss.

I think it may also make sense to only allow a COMPLETE after
POSTCOPY_ACTIVE.

That is, if src sends too fast to have finished sending everything,
reaching COMPLETE during POSTCOPY_DEVICE, that is, while before it receives
the new PONG you defined, then.. I _think_ it is better to wait for that.

If it finally arrives, then it's perfect, we switch to POSTCOPY_ACTIVE,
then continue the completion.

If the channel is broken before its arrival, logically we should handle
this case as a FAILURE and restart the VM on src.

It's only relevant in a corner case, but does that sound better?

-- 
Peter Xu
Hi Peter, On 2025-09-19 13:50, Peter Xu wrote: > On Fri, Sep 19, 2025 at 12:58:08PM -0400, Peter Xu wrote: > > > @@ -2564,6 +2569,11 @@ static void *source_return_path_thread(void *opaque) > > > tmp32 = ldl_be_p(buf); > > > trace_source_return_path_thread_pong(tmp32); > > > qemu_sem_post(&ms->rp_state.rp_pong_acks); > > > + if (tmp32 == QEMU_VM_PING_PACKAGED_LOADED) { > > > + trace_source_return_path_thread_dst_started(); > > > + migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_DEVICE, > > > + MIGRATION_STATUS_POSTCOPY_ACTIVE); > > > > Could this race with the migration thread modifying the state concurrently? > > > > To avoid it, we could have a bool, set it here once, and in the iterations > > do something like: > > > > diff --git a/migration/migration.c b/migration/migration.c > > index 10c216d25d..55230e10ee 100644 > > --- a/migration/migration.c > > +++ b/migration/migration.c > > @@ -3449,6 +3449,16 @@ static MigIterateState migration_iteration_run(MigrationState *s) > > trace_migrate_pending_estimate(pending_size, must_precopy, can_postcopy); > > > > if (in_postcopy) { > > + if (s->postcopy_package_loaded) { > > + assert(s->state == MIGRATION_STATUS_POSTCOPY_DEVICE); > > + migrate_set_state(s->state, MIGRATION_STATUS_POSTCOPY_DEVICE, > > + MIGRATION_STATUS_POSTCOPY_ACTIVE); > > + /* > > + * Since postcopy cannot be re-initiated, this flag will only > > + * be set at most once for QEMU's whole lifecyce. > > + */ > > + s->postcopy_package_loaded = false; > > + } > > /* > > * Iterate in postcopy until all pending data flushed. Note that > > * postcopy completion doesn't rely on can_switchover, because when It was there in the RFC version, when there was mutual handshake before dst starting, but I thought cmp&exchange would be enough. I can add it again, however, there is no need to set it to false afterwards, we can simply check if this condition is true: s->postcopy_package_loaded && s->state == MIGRATION_STATUS_POSTCOPY_DEVICE. > > [...] > > > > @@ -2871,7 +2882,7 @@ static int postcopy_start(MigrationState *ms, Error **errp) > > > > > > /* Now, switchover looks all fine, switching to postcopy-active */ > > > migrate_set_state(&ms->state, MIGRATION_STATUS_DEVICE, > > > - MIGRATION_STATUS_POSTCOPY_ACTIVE); > > > + MIGRATION_STATUS_POSTCOPY_DEVICE); > > > > > > bql_unlock(); > > > > > > @@ -3035,7 +3046,8 @@ static void migration_completion(MigrationState *s) > > > > > > if (s->state == MIGRATION_STATUS_ACTIVE) { > > > ret = migration_completion_precopy(s); > > > - } else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) { > > > + } else if (s->state == MIGRATION_STATUS_POSTCOPY_DEVICE || > > > + s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) { > > > > Exactly. We need to be prepared that src sending too fast so when device > > loading on dest we finished. > > One thing more to mention here.. which may void some previous comments I > left. Let's discuss. > > I think it may also make sense to only allow a COMPLETE after > POSTCOPY_ACTIVE. > > That is, if src sends too fast to have finished sending everything, > reaching COMPLETE during POSTCOPY_DEVICE, that is, while before it receives > the new PONG you defined, then.. I _think_ it is better to wait for that. > > If it finally arrives, then it's perfect, we switch to POSTCOPY_ACTIVE, > then continue the completion. > > If the channel is broken before its arrival, logically we should handle > this case as a FAILURE and restart the VM on src. > > It's only relevant in a corner case, but does that sound better? 
Yes, it does make sense to wait for POSTCOPY_ACTIVE as src could finish
before dst finished package load and could still fail. We could use a
qemu_event_wait() to wait and set the event in src return path thread
when the PONG is received.

> 
> -- 
> Peter Xu
> 
On Mon, Sep 22, 2025 at 03:34:19PM +0200, Juraj Marcin wrote: > Hi Peter, > > On 2025-09-19 13:50, Peter Xu wrote: > > On Fri, Sep 19, 2025 at 12:58:08PM -0400, Peter Xu wrote: > > > > @@ -2564,6 +2569,11 @@ static void *source_return_path_thread(void *opaque) > > > > tmp32 = ldl_be_p(buf); > > > > trace_source_return_path_thread_pong(tmp32); > > > > qemu_sem_post(&ms->rp_state.rp_pong_acks); > > > > + if (tmp32 == QEMU_VM_PING_PACKAGED_LOADED) { > > > > + trace_source_return_path_thread_dst_started(); > > > > + migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_DEVICE, > > > > + MIGRATION_STATUS_POSTCOPY_ACTIVE); > > > > > > Could this race with the migration thread modifying the state concurrently? > > > > > > To avoid it, we could have a bool, set it here once, and in the iterations > > > do something like: > > > > > > diff --git a/migration/migration.c b/migration/migration.c > > > index 10c216d25d..55230e10ee 100644 > > > --- a/migration/migration.c > > > +++ b/migration/migration.c > > > @@ -3449,6 +3449,16 @@ static MigIterateState migration_iteration_run(MigrationState *s) > > > trace_migrate_pending_estimate(pending_size, must_precopy, can_postcopy); > > > > > > if (in_postcopy) { > > > + if (s->postcopy_package_loaded) { > > > + assert(s->state == MIGRATION_STATUS_POSTCOPY_DEVICE); > > > + migrate_set_state(s->state, MIGRATION_STATUS_POSTCOPY_DEVICE, > > > + MIGRATION_STATUS_POSTCOPY_ACTIVE); > > > + /* > > > + * Since postcopy cannot be re-initiated, this flag will only > > > + * be set at most once for QEMU's whole lifecyce. > > > + */ > > > + s->postcopy_package_loaded = false; > > > + } > > > /* > > > * Iterate in postcopy until all pending data flushed. Note that > > > * postcopy completion doesn't rely on can_switchover, because when > > It was there in the RFC version, when there was mutual handshake before > dst starting, but I thought cmp&exchange would be enough. I can add it > again, however, there is no need to set it to false afterwards, we can > simply check if this condition is true: > s->postcopy_package_loaded && s->state == MIGRATION_STATUS_POSTCOPY_DEVICE. Setting it to false was a safety measure. Indeed not needed, but when so we need to be extremely careful to always check with above two conditions to avoid it frequently triggers. So I thought clearing it would be much easier to read. I can wait and read the new version whatever you prefer. > > > > > [...] > > > > > > @@ -2871,7 +2882,7 @@ static int postcopy_start(MigrationState *ms, Error **errp) > > > > > > > > /* Now, switchover looks all fine, switching to postcopy-active */ > > > > migrate_set_state(&ms->state, MIGRATION_STATUS_DEVICE, > > > > - MIGRATION_STATUS_POSTCOPY_ACTIVE); > > > > + MIGRATION_STATUS_POSTCOPY_DEVICE); > > > > > > > > bql_unlock(); > > > > > > > > @@ -3035,7 +3046,8 @@ static void migration_completion(MigrationState *s) > > > > > > > > if (s->state == MIGRATION_STATUS_ACTIVE) { > > > > ret = migration_completion_precopy(s); > > > > - } else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) { > > > > + } else if (s->state == MIGRATION_STATUS_POSTCOPY_DEVICE || > > > > + s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) { > > > > > > Exactly. We need to be prepared that src sending too fast so when device > > > loading on dest we finished. > > > > One thing more to mention here.. which may void some previous comments I > > left. Let's discuss. > > > > I think it may also make sense to only allow a COMPLETE after > > POSTCOPY_ACTIVE. 
> > 
> > That is, if src sends too fast to have finished sending everything,
> > reaching COMPLETE during POSTCOPY_DEVICE, that is, while before it receives
> > the new PONG you defined, then.. I _think_ it is better to wait for that.
> > 
> > If it finally arrives, then it's perfect, we switch to POSTCOPY_ACTIVE,
> > then continue the completion.
> > 
> > If the channel is broken before its arrival, logically we should handle
> > this case as a FAILURE and restart the VM on src.
> > 
> > It's only relevant in a corner case, but does that sound better?
> 
> Yes, it does make sense to wait for POSTCOPY_ACTIVE as src could finish
> before dst finished package load and could still fail. We could use a
> qemu_event_wait() to wait and set the event in src return path thread
> when the PONG is received.

Right.

Though since we want to move to POSTCOPY_ACTIVE asap when receiving the
corresponding PONG, we may want to have something like qemu_event_read()
just return "ev->value == EV_SET", as we can't event_wait() in the
migration thread while pushing RAMs.

Or, to avoid touching library code, we can also introduce yet another bool,
then when receiving the PONG we set both (1) the bool, and (2) the event.
We can check the bool in the iterations (when set, wait() to consume the
event; the event should have happened or just to happen), and wait() on the
event when completing (when return, bool must have been set, hence can
reset bool).

-- 
Peter Xu
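For the first option, such a helper could look roughly like this
(illustrative only: qemu_event_read() is not an existing API, and EV_SET is
currently private to the QemuEvent implementation in util/, so the helper
would have to live there or the constant be exported):

    /* Hypothetical non-blocking probe of a QemuEvent. */
    static bool qemu_event_read(QemuEvent *ev)
    {
        /* Acquire pairs with the release ordering in qemu_event_set(). */
        return qatomic_load_acquire(&ev->value) == EV_SET;
    }

The migration thread could then poll it from the iteration loop without
blocking while RAM is still being pushed.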
On 2025-09-22 12:16, Peter Xu wrote: > On Mon, Sep 22, 2025 at 03:34:19PM +0200, Juraj Marcin wrote: > > Hi Peter, > > > > On 2025-09-19 13:50, Peter Xu wrote: > > > On Fri, Sep 19, 2025 at 12:58:08PM -0400, Peter Xu wrote: > > > > > @@ -2564,6 +2569,11 @@ static void *source_return_path_thread(void *opaque) > > > > > tmp32 = ldl_be_p(buf); > > > > > trace_source_return_path_thread_pong(tmp32); > > > > > qemu_sem_post(&ms->rp_state.rp_pong_acks); > > > > > + if (tmp32 == QEMU_VM_PING_PACKAGED_LOADED) { > > > > > + trace_source_return_path_thread_dst_started(); > > > > > + migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_DEVICE, > > > > > + MIGRATION_STATUS_POSTCOPY_ACTIVE); > > > > > > > > Could this race with the migration thread modifying the state concurrently? > > > > > > > > To avoid it, we could have a bool, set it here once, and in the iterations > > > > do something like: > > > > > > > > diff --git a/migration/migration.c b/migration/migration.c > > > > index 10c216d25d..55230e10ee 100644 > > > > --- a/migration/migration.c > > > > +++ b/migration/migration.c > > > > @@ -3449,6 +3449,16 @@ static MigIterateState migration_iteration_run(MigrationState *s) > > > > trace_migrate_pending_estimate(pending_size, must_precopy, can_postcopy); > > > > > > > > if (in_postcopy) { > > > > + if (s->postcopy_package_loaded) { > > > > + assert(s->state == MIGRATION_STATUS_POSTCOPY_DEVICE); > > > > + migrate_set_state(s->state, MIGRATION_STATUS_POSTCOPY_DEVICE, > > > > + MIGRATION_STATUS_POSTCOPY_ACTIVE); > > > > + /* > > > > + * Since postcopy cannot be re-initiated, this flag will only > > > > + * be set at most once for QEMU's whole lifecyce. > > > > + */ > > > > + s->postcopy_package_loaded = false; > > > > + } > > > > /* > > > > * Iterate in postcopy until all pending data flushed. Note that > > > > * postcopy completion doesn't rely on can_switchover, because when > > > > It was there in the RFC version, when there was mutual handshake before > > dst starting, but I thought cmp&exchange would be enough. I can add it > > again, however, there is no need to set it to false afterwards, we can > > simply check if this condition is true: > > s->postcopy_package_loaded && s->state == MIGRATION_STATUS_POSTCOPY_DEVICE. > > Setting it to false was a safety measure. Indeed not needed, but when so > we need to be extremely careful to always check with above two conditions > to avoid it frequently triggers. So I thought clearing it would be much > easier to read. I can wait and read the new version whatever you prefer. > > > > > > > > > [...] > > > > > > > > @@ -2871,7 +2882,7 @@ static int postcopy_start(MigrationState *ms, Error **errp) > > > > > > > > > > /* Now, switchover looks all fine, switching to postcopy-active */ > > > > > migrate_set_state(&ms->state, MIGRATION_STATUS_DEVICE, > > > > > - MIGRATION_STATUS_POSTCOPY_ACTIVE); > > > > > + MIGRATION_STATUS_POSTCOPY_DEVICE); > > > > > > > > > > bql_unlock(); > > > > > > > > > > @@ -3035,7 +3046,8 @@ static void migration_completion(MigrationState *s) > > > > > > > > > > if (s->state == MIGRATION_STATUS_ACTIVE) { > > > > > ret = migration_completion_precopy(s); > > > > > - } else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) { > > > > > + } else if (s->state == MIGRATION_STATUS_POSTCOPY_DEVICE || > > > > > + s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) { > > > > > > > > Exactly. We need to be prepared that src sending too fast so when device > > > > loading on dest we finished. > > > > > > One thing more to mention here.. 
> > > which may void some previous comments I left.  Let's discuss.
> > > 
> > > I think it may also make sense to only allow a COMPLETE after
> > > POSTCOPY_ACTIVE.
> > > 
> > > That is, if src sends too fast to have finished sending everything,
> > > reaching COMPLETE during POSTCOPY_DEVICE, that is, while before it receives
> > > the new PONG you defined, then.. I _think_ it is better to wait for that.
> > > 
> > > If it finally arrives, then it's perfect, we switch to POSTCOPY_ACTIVE,
> > > then continue the completion.
> > > 
> > > If the channel is broken before its arrival, logically we should handle
> > > this case as a FAILURE and restart the VM on src.
> > > 
> > > It's only relevant in a corner case, but does that sound better?
> > 
> > Yes, it does make sense to wait for POSTCOPY_ACTIVE as src could finish
> > before dst finished package load and could still fail. We could use a
> > qemu_event_wait() to wait and set the event in src return path thread
> > when the PONG is received.
> 
> Right.
> 
> Though since we want to move to POSTCOPY_ACTIVE asap when receiving the
> corresponding PONG, we may want to have something like qemu_event_read()
> just return "ev->value == EV_SET", as we can't event_wait() in the
> migration thread while pushing RAMs.
> 
> Or, to avoid touching library code, we can also introduce yet another bool,
> then when receiving the PONG we set both (1) the bool, and (2) the event.
> We can check the bool in the iterations (when set, wait() to consume the
> event; the event should have happened or just to happen), and wait() on the
> event when completing (when return, bool must have been set, hence can
> reset bool).

Yes, my initial idea was the latter with two new attributes, a bool and
an event, definitely not waiting while there are still pages to send.
But I can also evaluate the first one before sending another version.

Thanks!

> 
> -- 
> Peter Xu
> 
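A minimal sketch of the latter option (device_loaded and
device_loaded_event are made-up MigrationState members used here only for
illustration, not part of the posted series):

    /*
     * In source_return_path_thread(), on receiving the new PONG:
     *     ms->device_loaded = true;
     *     qemu_event_set(&ms->device_loaded_event);
     */

    /* Called from migration_iteration_run(); never blocks while pushing RAM. */
    static void postcopy_try_switch_to_active(MigrationState *s)
    {
        if (s->device_loaded && s->state == MIGRATION_STATUS_POSTCOPY_DEVICE) {
            migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_DEVICE,
                              MIGRATION_STATUS_POSTCOPY_ACTIVE);
        }
    }

    /*
     * Called on the completion path: block until the destination confirmed
     * the package load.  A broken return path would also need to set the
     * event (with device_loaded left false) so this cannot hang; that case
     * is then treated as FAILED and the source VM resumed.
     */
    static void postcopy_wait_device_loaded(MigrationState *s)
    {
        qemu_event_wait(&s->device_loaded_event);
    }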