From: Steve Sistare <steven.sistare@oracle.com>
When migration for cpr is initiated, stop the vm and set state
RUN_STATE_FINISH_MIGRATE before ram is saved. This eliminates the
possibility of ram and device state being out of sync, and guarantees
that a guest in the suspended state remains suspended, because qmp_cont
rejects a cont command in the RUN_STATE_FINISH_MIGRATE state.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Link: https://lore.kernel.org/r/1708622920-68779-11-git-send-email-steven.sistare@oracle.com
Signed-off-by: Peter Xu <peterx@redhat.com>
---
include/migration/misc.h | 1 +
migration/migration.h | 2 --
migration/migration.c | 51 ++++++++++++++++++++++++----------------
3 files changed, 32 insertions(+), 22 deletions(-)
diff --git a/include/migration/misc.h b/include/migration/misc.h
index e4933b815b..5d1aa593ed 100644
--- a/include/migration/misc.h
+++ b/include/migration/misc.h
@@ -60,6 +60,7 @@ void migration_object_init(void);
void migration_shutdown(void);
bool migration_is_idle(void);
bool migration_is_active(MigrationState *);
+bool migrate_mode_is_cpr(MigrationState *);
typedef enum MigrationEventType {
MIG_EVENT_PRECOPY_SETUP,
diff --git a/migration/migration.h b/migration/migration.h
index aef8afbe1f..65c0b61cbd 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -541,6 +541,4 @@ int migration_rp_wait(MigrationState *s);
*/
void migration_rp_kick(MigrationState *s);
-int migration_stop_vm(RunState state);
-
#endif
diff --git a/migration/migration.c b/migration/migration.c
index 37c836b0b0..90a90947fb 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -167,11 +167,19 @@ static gint page_request_addr_cmp(gconstpointer ap, gconstpointer bp)
return (a > b) - (a < b);
}
-int migration_stop_vm(RunState state)
+static int migration_stop_vm(MigrationState *s, RunState state)
{
- int ret = vm_stop_force_state(state);
+ int ret;
+
+ migration_downtime_start(s);
+
+ s->vm_old_state = runstate_get();
+ global_state_store();
+
+ ret = vm_stop_force_state(state);
trace_vmstate_downtime_checkpoint("src-vm-stopped");
+ trace_migration_completion_vm_stop(ret);
return ret;
}
@@ -1602,6 +1610,11 @@ bool migration_is_active(MigrationState *s)
s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
}
+bool migrate_mode_is_cpr(MigrationState *s)
+{
+ return s->parameters.mode == MIG_MODE_CPR_REBOOT;
+}
+
int migrate_init(MigrationState *s, Error **errp)
{
int ret;
@@ -2454,10 +2467,7 @@ static int postcopy_start(MigrationState *ms, Error **errp)
bql_lock();
trace_postcopy_start_set_run();
- migration_downtime_start(ms);
-
- global_state_store();
- ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
+ ret = migration_stop_vm(ms, RUN_STATE_FINISH_MIGRATE);
if (ret < 0) {
goto fail;
}
@@ -2652,15 +2662,12 @@ static int migration_completion_precopy(MigrationState *s,
int ret;
bql_lock();
- migration_downtime_start(s);
-
- s->vm_old_state = runstate_get();
- global_state_store();
- ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
- trace_migration_completion_vm_stop(ret);
- if (ret < 0) {
- goto out_unlock;
+ if (!migrate_mode_is_cpr(s)) {
+ ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE);
+ if (ret < 0) {
+ goto out_unlock;
+ }
}
ret = migration_maybe_pause(s, current_active_state,
@@ -3500,15 +3507,10 @@ static void *bg_migration_thread(void *opaque)
s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
trace_migration_thread_setup_complete();
- migration_downtime_start(s);
bql_lock();
- s->vm_old_state = runstate_get();
-
- global_state_store();
- /* Forcibly stop VM before saving state of vCPUs and devices */
- if (migration_stop_vm(RUN_STATE_PAUSED)) {
+ if (migration_stop_vm(s, RUN_STATE_PAUSED)) {
goto fail;
}
/*
@@ -3584,6 +3586,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
Error *local_err = NULL;
uint64_t rate_limit;
bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED;
+ int ret;
/*
* If there's a previous error, free it and prepare for another one.
@@ -3655,6 +3658,14 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
return;
}
+ if (migrate_mode_is_cpr(s)) {
+ ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE);
+ if (ret < 0) {
+ error_setg(&local_err, "migration_stop_vm failed, error %d", -ret);
+ goto fail;
+ }
+ }
+
if (migrate_background_snapshot()) {
qemu_thread_create(&s->thread, "bg_snapshot",
bg_migration_thread, s, QEMU_THREAD_JOINABLE);
--
2.43.0
Hi Steve, We found that this specific change in particular ("migration: stop vm for cpr") fixes a bug that we've identified in testing back-to-back live updates in a lab environment. More specifically, *without* this change (which is not available in 8.2.2, but *is* available in 9.0.0) causes the metadata save file to be corrupted when doing live updates one after another. Typically we see a corrupted save file somewhere in between 20 and 30 live updates and while doing a git bisect, we found that this change makes the problem go away. Were you aware? Is there any plan in place to cherry pick this for 8.2.3, perhaps or a plan to release 8.2.3 at some point? Here are some examples of how the bug manifests in different locations of the QEMU metadata save file: 2024-04-26T13:28:53Z qemu-system-x86_64: Failed to load mtrr_var:base 2024-04-26T13:28:53Z qemu-system-x86_64: Failed to load cpu:env.mtrr_var 2024-04-26T13:28:53Z qemu-system-x86_64: error while loading state for instance 0x1b of device 'cpu' 2024-04-26T13:28:53Z qemu-system-x86_64: load of migration failed: Input/output error And another: 2024-04-17T16:09:47Z qemu-system-x86_64: check_section_footer: Read section footer failed: -5 2024-04-17T16:09:47Z qemu-system-x86_64: load of migration failed: Invalid argument And another: 2024-04-30T21:53:29Z qemu-system-x86_64: Unable to read ID string for section 163 2024-04-30T21:53:29Z qemu-system-x86_64: load of migration failed: Invalid argument And another: 2024-05-01T16:01:44Z qemu-system-x86_64: Unable to read ID string for section 164 2024-05-01T16:01:44Z qemu-system-x86_64: load of migration failed: Invalid argument As you can see, they occur quite randomly, but generally it takes at least 20-30+ live updates before the problem occurs. - Michael On 2/27/24 23:13, peterx@redhat.com wrote: > From: Steve Sistare<steven.sistare@oracle.com> > > When migration for cpr is initiated, stop the vm and set state > RUN_STATE_FINISH_MIGRATE before ram is saved. This eliminates the > possibility of ram and device state being out of sync, and guarantees > that a guest in the suspended state remains suspended, because qmp_cont > rejects a cont command in the RUN_STATE_FINISH_MIGRATE state. > > Signed-off-by: Steve Sistare<steven.sistare@oracle.com> > Reviewed-by: Peter Xu<peterx@redhat.com> > Link:https://urldefense.com/v3/__https://lore.kernel.org/r/1708622920-68779-11-git-send-email-steven.sistare@oracle.com__;!!GjvTz_vk!QLsFOCX-x2U9bzAo98SdidKlomHrmf_t0UmQKtgudoIcaDVoAJOPm39ZqaNP_nT5I8QqVfSgwhDZmg$ > Signed-off-by: Peter Xu<peterx@redhat.com> > --- > include/migration/misc.h | 1 + > migration/migration.h | 2 -- > migration/migration.c | 51 ++++++++++++++++++++++++---------------- > 3 files changed, 32 insertions(+), 22 deletions(-) > > diff --git a/include/migration/misc.h b/include/migration/misc.h > index e4933b815b..5d1aa593ed 100644 > --- a/include/migration/misc.h > +++ b/include/migration/misc.h > @@ -60,6 +60,7 @@ void migration_object_init(void); > void migration_shutdown(void); > bool migration_is_idle(void); > bool migration_is_active(MigrationState *); > +bool migrate_mode_is_cpr(MigrationState *); > > typedef enum MigrationEventType { > MIG_EVENT_PRECOPY_SETUP, > diff --git a/migration/migration.h b/migration/migration.h > index aef8afbe1f..65c0b61cbd 100644 > --- a/migration/migration.h > +++ b/migration/migration.h > @@ -541,6 +541,4 @@ int migration_rp_wait(MigrationState *s); > */ > void migration_rp_kick(MigrationState *s); > > -int migration_stop_vm(RunState state); > - > #endif > diff --git a/migration/migration.c b/migration/migration.c > index 37c836b0b0..90a90947fb 100644 > --- a/migration/migration.c > +++ b/migration/migration.c > @@ -167,11 +167,19 @@ static gint page_request_addr_cmp(gconstpointer ap, gconstpointer bp) > return (a > b) - (a < b); > } > > -int migration_stop_vm(RunState state) > +static int migration_stop_vm(MigrationState *s, RunState state) > { > - int ret = vm_stop_force_state(state); > + int ret; > + > + migration_downtime_start(s); > + > + s->vm_old_state = runstate_get(); > + global_state_store(); > + > + ret = vm_stop_force_state(state); > > trace_vmstate_downtime_checkpoint("src-vm-stopped"); > + trace_migration_completion_vm_stop(ret); > > return ret; > } > @@ -1602,6 +1610,11 @@ bool migration_is_active(MigrationState *s) > s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE); > } > > +bool migrate_mode_is_cpr(MigrationState *s) > +{ > + return s->parameters.mode == MIG_MODE_CPR_REBOOT; > +} > + > int migrate_init(MigrationState *s, Error **errp) > { > int ret; > @@ -2454,10 +2467,7 @@ static int postcopy_start(MigrationState *ms, Error **errp) > bql_lock(); > trace_postcopy_start_set_run(); > > - migration_downtime_start(ms); > - > - global_state_store(); > - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); > + ret = migration_stop_vm(ms, RUN_STATE_FINISH_MIGRATE); > if (ret < 0) { > goto fail; > } > @@ -2652,15 +2662,12 @@ static int migration_completion_precopy(MigrationState *s, > int ret; > > bql_lock(); > - migration_downtime_start(s); > - > - s->vm_old_state = runstate_get(); > - global_state_store(); > > - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); > - trace_migration_completion_vm_stop(ret); > - if (ret < 0) { > - goto out_unlock; > + if (!migrate_mode_is_cpr(s)) { > + ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE); > + if (ret < 0) { > + goto out_unlock; > + } > } > > ret = migration_maybe_pause(s, current_active_state, > @@ -3500,15 +3507,10 @@ static void *bg_migration_thread(void *opaque) > s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start; > > trace_migration_thread_setup_complete(); > - migration_downtime_start(s); > > bql_lock(); > > - s->vm_old_state = runstate_get(); > - > - global_state_store(); > - /* Forcibly stop VM before saving state of vCPUs and devices */ > - if (migration_stop_vm(RUN_STATE_PAUSED)) { > + if (migration_stop_vm(s, RUN_STATE_PAUSED)) { > goto fail; > } > /* > @@ -3584,6 +3586,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) > Error *local_err = NULL; > uint64_t rate_limit; > bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED; > + int ret; > > /* > * If there's a previous error, free it and prepare for another one. > @@ -3655,6 +3658,14 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) > return; > } > > + if (migrate_mode_is_cpr(s)) { > + ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE); > + if (ret < 0) { > + error_setg(&local_err, "migration_stop_vm failed, error %d", -ret); > + goto fail; > + } > + } > + > if (migrate_background_snapshot()) { > qemu_thread_create(&s->thread, "bg_snapshot", > bg_migration_thread, s, QEMU_THREAD_JOINABLE);
Hi Michael, No surprise here, I did see some of the same failure messages and they prompted me to submit the fix. They are all symptoms of "the possibility of ram and device state being out of sync" as mentioned in the commit. I am not familiar with the process for maintaining old releases for qemu. Perhaps someone on this list can comment on 8.2.3. - Steve On 5/13/2024 2:22 PM, Michael Galaxy wrote: > Hi Steve, > > We found that this specific change in particular ("migration: stop vm for cpr") > fixes a bug that we've identified in testing back-to-back live updates in a lab > environment. > > More specifically, *without* this change (which is not available in 8.2.2, but > *is* available in 9.0.0) causes the metadata save file to be corrupted when > doing live updates one after another. Typically we see a corrupted save file > somewhere in between 20 and 30 live updates and while doing a git bisect, we > found that this change makes the problem go away. > > Were you aware? Is there any plan in place to cherry pick this for 8.2.3, > perhaps or a plan to release 8.2.3 at some point? > > Here are some examples of how the bug manifests in different locations of the > QEMU metadata save file: > > 2024-04-26T13:28:53Z qemu-system-x86_64: Failed to load mtrr_var:base > 2024-04-26T13:28:53Z qemu-system-x86_64: Failed to load cpu:env.mtrr_var > 2024-04-26T13:28:53Z qemu-system-x86_64: error while loading state for instance 0x1b of device 'cpu' > 2024-04-26T13:28:53Z qemu-system-x86_64: load of migration failed: Input/output error > > And another: > > 2024-04-17T16:09:47Z qemu-system-x86_64: check_section_footer: Read section footer failed: -5 > 2024-04-17T16:09:47Z qemu-system-x86_64: load of migration failed: Invalid argument > > And another: > > 2024-04-30T21:53:29Z qemu-system-x86_64: Unable to read ID string for section 163 > 2024-04-30T21:53:29Z qemu-system-x86_64: load of migration failed: Invalid argument > > And another: > > 2024-05-01T16:01:44Z qemu-system-x86_64: Unable to read ID string for section 164 > 2024-05-01T16:01:44Z qemu-system-x86_64: load of migration failed: Invalid argument > > > As you can see, they occur quite randomly, but generally it takes at least > 20-30+ live updates before the problem occurs. > > - Michael > > On 2/27/24 23:13, peterx@redhat.com wrote: >> From: Steve Sistare<steven.sistare@oracle.com> >> >> When migration for cpr is initiated, stop the vm and set state >> RUN_STATE_FINISH_MIGRATE before ram is saved. This eliminates the >> possibility of ram and device state being out of sync, and guarantees >> that a guest in the suspended state remains suspended, because qmp_cont >> rejects a cont command in the RUN_STATE_FINISH_MIGRATE state. >> >> Signed-off-by: Steve Sistare<steven.sistare@oracle.com> >> Reviewed-by: Peter Xu<peterx@redhat.com> >> Link:https://urldefense.com/v3/__https://lore.kernel.org/r/1708622920-68779-11-git-send-email-steven.sistare@oracle.com__;!!GjvTz_vk!QLsFOCX-x2U9bzAo98SdidKlomHrmf_t0UmQKtgudoIcaDVoAJOPm39ZqaNP_nT5I8QqVfSgwhDZmg$ >> Signed-off-by: Peter Xu<peterx@redhat.com> >> --- >> include/migration/misc.h | 1 + >> migration/migration.h | 2 -- >> migration/migration.c | 51 ++++++++++++++++++++++++---------------- >> 3 files changed, 32 insertions(+), 22 deletions(-) >> >> diff --git a/include/migration/misc.h b/include/migration/misc.h >> index e4933b815b..5d1aa593ed 100644 >> --- a/include/migration/misc.h >> +++ b/include/migration/misc.h >> @@ -60,6 +60,7 @@ void migration_object_init(void); >> void migration_shutdown(void); >> bool migration_is_idle(void); >> bool migration_is_active(MigrationState *); >> +bool migrate_mode_is_cpr(MigrationState *); >> >> typedef enum MigrationEventType { >> MIG_EVENT_PRECOPY_SETUP, >> diff --git a/migration/migration.h b/migration/migration.h >> index aef8afbe1f..65c0b61cbd 100644 >> --- a/migration/migration.h >> +++ b/migration/migration.h >> @@ -541,6 +541,4 @@ int migration_rp_wait(MigrationState *s); >> */ >> void migration_rp_kick(MigrationState *s); >> >> -int migration_stop_vm(RunState state); >> - >> #endif >> diff --git a/migration/migration.c b/migration/migration.c >> index 37c836b0b0..90a90947fb 100644 >> --- a/migration/migration.c >> +++ b/migration/migration.c >> @@ -167,11 +167,19 @@ static gint page_request_addr_cmp(gconstpointer ap, gconstpointer bp) >> return (a > b) - (a < b); >> } >> >> -int migration_stop_vm(RunState state) >> +static int migration_stop_vm(MigrationState *s, RunState state) >> { >> - int ret = vm_stop_force_state(state); >> + int ret; >> + >> + migration_downtime_start(s); >> + >> + s->vm_old_state = runstate_get(); >> + global_state_store(); >> + >> + ret = vm_stop_force_state(state); >> >> trace_vmstate_downtime_checkpoint("src-vm-stopped"); >> + trace_migration_completion_vm_stop(ret); >> >> return ret; >> } >> @@ -1602,6 +1610,11 @@ bool migration_is_active(MigrationState *s) >> s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE); >> } >> >> +bool migrate_mode_is_cpr(MigrationState *s) >> +{ >> + return s->parameters.mode == MIG_MODE_CPR_REBOOT; >> +} >> + >> int migrate_init(MigrationState *s, Error **errp) >> { >> int ret; >> @@ -2454,10 +2467,7 @@ static int postcopy_start(MigrationState *ms, Error **errp) >> bql_lock(); >> trace_postcopy_start_set_run(); >> >> - migration_downtime_start(ms); >> - >> - global_state_store(); >> - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); >> + ret = migration_stop_vm(ms, RUN_STATE_FINISH_MIGRATE); >> if (ret < 0) { >> goto fail; >> } >> @@ -2652,15 +2662,12 @@ static int migration_completion_precopy(MigrationState *s, >> int ret; >> >> bql_lock(); >> - migration_downtime_start(s); >> - >> - s->vm_old_state = runstate_get(); >> - global_state_store(); >> >> - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); >> - trace_migration_completion_vm_stop(ret); >> - if (ret < 0) { >> - goto out_unlock; >> + if (!migrate_mode_is_cpr(s)) { >> + ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE); >> + if (ret < 0) { >> + goto out_unlock; >> + } >> } >> >> ret = migration_maybe_pause(s, current_active_state, >> @@ -3500,15 +3507,10 @@ static void *bg_migration_thread(void *opaque) >> s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start; >> >> trace_migration_thread_setup_complete(); >> - migration_downtime_start(s); >> >> bql_lock(); >> >> - s->vm_old_state = runstate_get(); >> - >> - global_state_store(); >> - /* Forcibly stop VM before saving state of vCPUs and devices */ >> - if (migration_stop_vm(RUN_STATE_PAUSED)) { >> + if (migration_stop_vm(s, RUN_STATE_PAUSED)) { >> goto fail; >> } >> /* >> @@ -3584,6 +3586,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) >> Error *local_err = NULL; >> uint64_t rate_limit; >> bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED; >> + int ret; >> >> /* >> * If there's a previous error, free it and prepare for another one. >> @@ -3655,6 +3658,14 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) >> return; >> } >> >> + if (migrate_mode_is_cpr(s)) { >> + ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE); >> + if (ret < 0) { >> + error_setg(&local_err, "migration_stop_vm failed, error %d", -ret); >> + goto fail; >> + } >> + } >> + >> if (migrate_background_snapshot()) { >> qemu_thread_create(&s->thread, "bg_snapshot", >> bg_migration_thread, s, QEMU_THREAD_JOINABLE);
Hi Steve, Thanks for the response. It looks like literally *just today* 8.2.4 was released. I'll go check it out. - Michael On 5/13/24 15:10, Steven Sistare wrote: > Hi Michael, > No surprise here, I did see some of the same failure messages and they > prompted me to submit the fix. They are all symptoms of "the > possibility of > ram and device state being out of sync" as mentioned in the commit. > > I am not familiar with the process for maintaining old releases for qemu. > Perhaps someone on this list can comment on 8.2.3. > > - Steve > > On 5/13/2024 2:22 PM, Michael Galaxy wrote: >> Hi Steve, >> >> We found that this specific change in particular ("migration: stop vm >> for cpr") fixes a bug that we've identified in testing back-to-back >> live updates in a lab environment. >> >> More specifically, *without* this change (which is not available in >> 8.2.2, but *is* available in 9.0.0) causes the metadata save file to >> be corrupted when doing live updates one after another. Typically we >> see a corrupted save file somewhere in between 20 and 30 live updates >> and while doing a git bisect, we found that this change makes the >> problem go away. >> >> Were you aware? Is there any plan in place to cherry pick this for >> 8.2.3, perhaps or a plan to release 8.2.3 at some point? >> >> Here are some examples of how the bug manifests in different >> locations of the QEMU metadata save file: >> >> 2024-04-26T13:28:53Z qemu-system-x86_64: Failed to load mtrr_var:base >> 2024-04-26T13:28:53Z qemu-system-x86_64: Failed to load cpu:env.mtrr_var >> 2024-04-26T13:28:53Z qemu-system-x86_64: error while loading state >> for instance 0x1b of device 'cpu' >> 2024-04-26T13:28:53Z qemu-system-x86_64: load of migration failed: >> Input/output error >> >> And another: >> >> 2024-04-17T16:09:47Z qemu-system-x86_64: check_section_footer: Read >> section footer failed: -5 >> 2024-04-17T16:09:47Z qemu-system-x86_64: load of migration failed: >> Invalid argument >> >> And another: >> >> 2024-04-30T21:53:29Z qemu-system-x86_64: Unable to read ID string for >> section 163 >> 2024-04-30T21:53:29Z qemu-system-x86_64: load of migration failed: >> Invalid argument >> >> And another: >> >> 2024-05-01T16:01:44Z qemu-system-x86_64: Unable to read ID string for >> section 164 >> 2024-05-01T16:01:44Z qemu-system-x86_64: load of migration failed: >> Invalid argument >> >> As you can see, they occur quite randomly, but generally it takes at >> least 20-30+ live updates before the problem occurs. >> >> - Michael >> >> On 2/27/24 23:13, peterx@redhat.com wrote: >>> From: Steve Sistare<steven.sistare@oracle.com> >>> >>> When migration for cpr is initiated, stop the vm and set state >>> RUN_STATE_FINISH_MIGRATE before ram is saved. This eliminates the >>> possibility of ram and device state being out of sync, and guarantees >>> that a guest in the suspended state remains suspended, because qmp_cont >>> rejects a cont command in the RUN_STATE_FINISH_MIGRATE state. >>> >>> Signed-off-by: Steve Sistare<steven.sistare@oracle.com> >>> Reviewed-by: Peter Xu<peterx@redhat.com> >>> Link:https://urldefense.com/v3/__https://lore.kernel.org/r/1708622920-68779-11-git-send-email-steven.sistare@oracle.com__;!!GjvTz_vk!QLsFOCX-x2U9bzAo98SdidKlomHrmf_t0UmQKtgudoIcaDVoAJOPm39ZqaNP_nT5I8QqVfSgwhDZmg$ >>> Signed-off-by: Peter Xu<peterx@redhat.com> >>> --- >>> include/migration/misc.h | 1 + >>> migration/migration.h | 2 -- >>> migration/migration.c | 51 >>> ++++++++++++++++++++++++---------------- >>> 3 files changed, 32 insertions(+), 22 deletions(-) >>> >>> diff --git a/include/migration/misc.h b/include/migration/misc.h >>> index e4933b815b..5d1aa593ed 100644 >>> --- a/include/migration/misc.h >>> +++ b/include/migration/misc.h >>> @@ -60,6 +60,7 @@ void migration_object_init(void); >>> void migration_shutdown(void); >>> bool migration_is_idle(void); >>> bool migration_is_active(MigrationState *); >>> +bool migrate_mode_is_cpr(MigrationState *); >>> typedef enum MigrationEventType { >>> MIG_EVENT_PRECOPY_SETUP, >>> diff --git a/migration/migration.h b/migration/migration.h >>> index aef8afbe1f..65c0b61cbd 100644 >>> --- a/migration/migration.h >>> +++ b/migration/migration.h >>> @@ -541,6 +541,4 @@ int migration_rp_wait(MigrationState *s); >>> */ >>> void migration_rp_kick(MigrationState *s); >>> -int migration_stop_vm(RunState state); >>> - >>> #endif >>> diff --git a/migration/migration.c b/migration/migration.c >>> index 37c836b0b0..90a90947fb 100644 >>> --- a/migration/migration.c >>> +++ b/migration/migration.c >>> @@ -167,11 +167,19 @@ static gint >>> page_request_addr_cmp(gconstpointer ap, gconstpointer bp) >>> return (a > b) - (a < b); >>> } >>> -int migration_stop_vm(RunState state) >>> +static int migration_stop_vm(MigrationState *s, RunState state) >>> { >>> - int ret = vm_stop_force_state(state); >>> + int ret; >>> + >>> + migration_downtime_start(s); >>> + >>> + s->vm_old_state = runstate_get(); >>> + global_state_store(); >>> + >>> + ret = vm_stop_force_state(state); >>> trace_vmstate_downtime_checkpoint("src-vm-stopped"); >>> + trace_migration_completion_vm_stop(ret); >>> return ret; >>> } >>> @@ -1602,6 +1610,11 @@ bool migration_is_active(MigrationState *s) >>> s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE); >>> } >>> +bool migrate_mode_is_cpr(MigrationState *s) >>> +{ >>> + return s->parameters.mode == MIG_MODE_CPR_REBOOT; >>> +} >>> + >>> int migrate_init(MigrationState *s, Error **errp) >>> { >>> int ret; >>> @@ -2454,10 +2467,7 @@ static int postcopy_start(MigrationState *ms, >>> Error **errp) >>> bql_lock(); >>> trace_postcopy_start_set_run(); >>> - migration_downtime_start(ms); >>> - >>> - global_state_store(); >>> - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); >>> + ret = migration_stop_vm(ms, RUN_STATE_FINISH_MIGRATE); >>> if (ret < 0) { >>> goto fail; >>> } >>> @@ -2652,15 +2662,12 @@ static int >>> migration_completion_precopy(MigrationState *s, >>> int ret; >>> bql_lock(); >>> - migration_downtime_start(s); >>> - >>> - s->vm_old_state = runstate_get(); >>> - global_state_store(); >>> - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); >>> - trace_migration_completion_vm_stop(ret); >>> - if (ret < 0) { >>> - goto out_unlock; >>> + if (!migrate_mode_is_cpr(s)) { >>> + ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE); >>> + if (ret < 0) { >>> + goto out_unlock; >>> + } >>> } >>> ret = migration_maybe_pause(s, current_active_state, >>> @@ -3500,15 +3507,10 @@ static void *bg_migration_thread(void *opaque) >>> s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start; >>> trace_migration_thread_setup_complete(); >>> - migration_downtime_start(s); >>> bql_lock(); >>> - s->vm_old_state = runstate_get(); >>> - >>> - global_state_store(); >>> - /* Forcibly stop VM before saving state of vCPUs and devices */ >>> - if (migration_stop_vm(RUN_STATE_PAUSED)) { >>> + if (migration_stop_vm(s, RUN_STATE_PAUSED)) { >>> goto fail; >>> } >>> /* >>> @@ -3584,6 +3586,7 @@ void migrate_fd_connect(MigrationState *s, >>> Error *error_in) >>> Error *local_err = NULL; >>> uint64_t rate_limit; >>> bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED; >>> + int ret; >>> /* >>> * If there's a previous error, free it and prepare for >>> another one. >>> @@ -3655,6 +3658,14 @@ void migrate_fd_connect(MigrationState *s, >>> Error *error_in) >>> return; >>> } >>> + if (migrate_mode_is_cpr(s)) { >>> + ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE); >>> + if (ret < 0) { >>> + error_setg(&local_err, "migration_stop_vm failed, error >>> %d", -ret); >>> + goto fail; >>> + } >>> + } >>> + >>> if (migrate_background_snapshot()) { >>> qemu_thread_create(&s->thread, "bg_snapshot", >>> bg_migration_thread, s, QEMU_THREAD_JOINABLE);
Steve, OK, so it does not look like this bugfix you wrote was included in 8.2.4 (which was released yesterday). Unfortunately, that means that anyone using CPR in that release will still (eventually) encounter the bug like I did. I would recommend that y'all consider cherry-picking, perhaps, the relevant commits for a possible 8.2.5 ? - Michael On 5/13/24 20:15, Michael Galaxy wrote: > Hi Steve, > > Thanks for the response. > > It looks like literally *just today* 8.2.4 was released. I'll go check > it out. > > - Michael > > On 5/13/24 15:10, Steven Sistare wrote: >> Hi Michael, >> No surprise here, I did see some of the same failure messages and they >> prompted me to submit the fix. They are all symptoms of "the >> possibility of >> ram and device state being out of sync" as mentioned in the commit. >> >> I am not familiar with the process for maintaining old releases for >> qemu. >> Perhaps someone on this list can comment on 8.2.3. >> >> - Steve >> >> On 5/13/2024 2:22 PM, Michael Galaxy wrote: >>> Hi Steve, >>> >>> We found that this specific change in particular ("migration: stop >>> vm for cpr") fixes a bug that we've identified in testing >>> back-to-back live updates in a lab environment. >>> >>> More specifically, *without* this change (which is not available in >>> 8.2.2, but *is* available in 9.0.0) causes the metadata save file to >>> be corrupted when doing live updates one after another. Typically we >>> see a corrupted save file somewhere in between 20 and 30 live >>> updates and while doing a git bisect, we found that this change >>> makes the problem go away. >>> >>> Were you aware? Is there any plan in place to cherry pick this for >>> 8.2.3, perhaps or a plan to release 8.2.3 at some point? >>> >>> Here are some examples of how the bug manifests in different >>> locations of the QEMU metadata save file: >>> >>> 2024-04-26T13:28:53Z qemu-system-x86_64: Failed to load mtrr_var:base >>> 2024-04-26T13:28:53Z qemu-system-x86_64: Failed to load >>> cpu:env.mtrr_var >>> 2024-04-26T13:28:53Z qemu-system-x86_64: error while loading state >>> for instance 0x1b of device 'cpu' >>> 2024-04-26T13:28:53Z qemu-system-x86_64: load of migration failed: >>> Input/output error >>> >>> And another: >>> >>> 2024-04-17T16:09:47Z qemu-system-x86_64: check_section_footer: Read >>> section footer failed: -5 >>> 2024-04-17T16:09:47Z qemu-system-x86_64: load of migration failed: >>> Invalid argument >>> >>> And another: >>> >>> 2024-04-30T21:53:29Z qemu-system-x86_64: Unable to read ID string >>> for section 163 >>> 2024-04-30T21:53:29Z qemu-system-x86_64: load of migration failed: >>> Invalid argument >>> >>> And another: >>> >>> 2024-05-01T16:01:44Z qemu-system-x86_64: Unable to read ID string >>> for section 164 >>> 2024-05-01T16:01:44Z qemu-system-x86_64: load of migration failed: >>> Invalid argument >>> >>> As you can see, they occur quite randomly, but generally it takes at >>> least 20-30+ live updates before the problem occurs. >>> >>> - Michael >>> >>> On 2/27/24 23:13, peterx@redhat.com wrote: >>>> From: Steve Sistare<steven.sistare@oracle.com> >>>> >>>> When migration for cpr is initiated, stop the vm and set state >>>> RUN_STATE_FINISH_MIGRATE before ram is saved. This eliminates the >>>> possibility of ram and device state being out of sync, and guarantees >>>> that a guest in the suspended state remains suspended, because >>>> qmp_cont >>>> rejects a cont command in the RUN_STATE_FINISH_MIGRATE state. >>>> >>>> Signed-off-by: Steve Sistare<steven.sistare@oracle.com> >>>> Reviewed-by: Peter Xu<peterx@redhat.com> >>>> Link:https://urldefense.com/v3/__https://lore.kernel.org/r/1708622920-68779-11-git-send-email-steven.sistare@oracle.com__;!!GjvTz_vk!QLsFOCX-x2U9bzAo98SdidKlomHrmf_t0UmQKtgudoIcaDVoAJOPm39ZqaNP_nT5I8QqVfSgwhDZmg$ >>>> Signed-off-by: Peter Xu<peterx@redhat.com> >>>> --- >>>> include/migration/misc.h | 1 + >>>> migration/migration.h | 2 -- >>>> migration/migration.c | 51 >>>> ++++++++++++++++++++++++---------------- >>>> 3 files changed, 32 insertions(+), 22 deletions(-) >>>> >>>> diff --git a/include/migration/misc.h b/include/migration/misc.h >>>> index e4933b815b..5d1aa593ed 100644 >>>> --- a/include/migration/misc.h >>>> +++ b/include/migration/misc.h >>>> @@ -60,6 +60,7 @@ void migration_object_init(void); >>>> void migration_shutdown(void); >>>> bool migration_is_idle(void); >>>> bool migration_is_active(MigrationState *); >>>> +bool migrate_mode_is_cpr(MigrationState *); >>>> typedef enum MigrationEventType { >>>> MIG_EVENT_PRECOPY_SETUP, >>>> diff --git a/migration/migration.h b/migration/migration.h >>>> index aef8afbe1f..65c0b61cbd 100644 >>>> --- a/migration/migration.h >>>> +++ b/migration/migration.h >>>> @@ -541,6 +541,4 @@ int migration_rp_wait(MigrationState *s); >>>> */ >>>> void migration_rp_kick(MigrationState *s); >>>> -int migration_stop_vm(RunState state); >>>> - >>>> #endif >>>> diff --git a/migration/migration.c b/migration/migration.c >>>> index 37c836b0b0..90a90947fb 100644 >>>> --- a/migration/migration.c >>>> +++ b/migration/migration.c >>>> @@ -167,11 +167,19 @@ static gint >>>> page_request_addr_cmp(gconstpointer ap, gconstpointer bp) >>>> return (a > b) - (a < b); >>>> } >>>> -int migration_stop_vm(RunState state) >>>> +static int migration_stop_vm(MigrationState *s, RunState state) >>>> { >>>> - int ret = vm_stop_force_state(state); >>>> + int ret; >>>> + >>>> + migration_downtime_start(s); >>>> + >>>> + s->vm_old_state = runstate_get(); >>>> + global_state_store(); >>>> + >>>> + ret = vm_stop_force_state(state); >>>> trace_vmstate_downtime_checkpoint("src-vm-stopped"); >>>> + trace_migration_completion_vm_stop(ret); >>>> return ret; >>>> } >>>> @@ -1602,6 +1610,11 @@ bool migration_is_active(MigrationState *s) >>>> s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE); >>>> } >>>> +bool migrate_mode_is_cpr(MigrationState *s) >>>> +{ >>>> + return s->parameters.mode == MIG_MODE_CPR_REBOOT; >>>> +} >>>> + >>>> int migrate_init(MigrationState *s, Error **errp) >>>> { >>>> int ret; >>>> @@ -2454,10 +2467,7 @@ static int postcopy_start(MigrationState >>>> *ms, Error **errp) >>>> bql_lock(); >>>> trace_postcopy_start_set_run(); >>>> - migration_downtime_start(ms); >>>> - >>>> - global_state_store(); >>>> - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); >>>> + ret = migration_stop_vm(ms, RUN_STATE_FINISH_MIGRATE); >>>> if (ret < 0) { >>>> goto fail; >>>> } >>>> @@ -2652,15 +2662,12 @@ static int >>>> migration_completion_precopy(MigrationState *s, >>>> int ret; >>>> bql_lock(); >>>> - migration_downtime_start(s); >>>> - >>>> - s->vm_old_state = runstate_get(); >>>> - global_state_store(); >>>> - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); >>>> - trace_migration_completion_vm_stop(ret); >>>> - if (ret < 0) { >>>> - goto out_unlock; >>>> + if (!migrate_mode_is_cpr(s)) { >>>> + ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE); >>>> + if (ret < 0) { >>>> + goto out_unlock; >>>> + } >>>> } >>>> ret = migration_maybe_pause(s, current_active_state, >>>> @@ -3500,15 +3507,10 @@ static void *bg_migration_thread(void *opaque) >>>> s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - >>>> setup_start; >>>> trace_migration_thread_setup_complete(); >>>> - migration_downtime_start(s); >>>> bql_lock(); >>>> - s->vm_old_state = runstate_get(); >>>> - >>>> - global_state_store(); >>>> - /* Forcibly stop VM before saving state of vCPUs and devices */ >>>> - if (migration_stop_vm(RUN_STATE_PAUSED)) { >>>> + if (migration_stop_vm(s, RUN_STATE_PAUSED)) { >>>> goto fail; >>>> } >>>> /* >>>> @@ -3584,6 +3586,7 @@ void migrate_fd_connect(MigrationState *s, >>>> Error *error_in) >>>> Error *local_err = NULL; >>>> uint64_t rate_limit; >>>> bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED; >>>> + int ret; >>>> /* >>>> * If there's a previous error, free it and prepare for >>>> another one. >>>> @@ -3655,6 +3658,14 @@ void migrate_fd_connect(MigrationState *s, >>>> Error *error_in) >>>> return; >>>> } >>>> + if (migrate_mode_is_cpr(s)) { >>>> + ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE); >>>> + if (ret < 0) { >>>> + error_setg(&local_err, "migration_stop_vm failed, >>>> error %d", -ret); >>>> + goto fail; >>>> + } >>>> + } >>>> + >>>> if (migrate_background_snapshot()) { >>>> qemu_thread_create(&s->thread, "bg_snapshot", >>>> bg_migration_thread, s, QEMU_THREAD_JOINABLE); >
On 5/14/24 16:39, Michael Galaxy wrote: > Steve, > > OK, so it does not look like this bugfix you wrote was included in 8.2.4 > (which was released yesterday). Unfortunately, that means that anyone > using CPR in that release will still (eventually) encounter the bug like > I did. 8.2.4 is basically a "bugfix" release for 8.2.3 which I somewhat screwed up (in a minor way), plus a few currently (at the time) queued up changes. 8.2.3 was a big release though. > I would recommend that y'all consider cherry-picking, perhaps, the > relevant commits for a possible 8.2.5 ? Please Cc changes which are relevant for -stable to, well, qemu-stable@nongnu.org :) Which changes needs to be picked up? Thanks, /mjt
On 5/14/24 08:54, Michael Tokarev wrote: > On 5/14/24 16:39, Michael Galaxy wrote: >> Steve, >> >> OK, so it does not look like this bugfix you wrote was included in >> 8.2.4 (which was released yesterday). Unfortunately, that means that >> anyone using CPR in that release will still (eventually) encounter >> the bug like I did. > > 8.2.4 is basically a "bugfix" release for 8.2.3 which I somewhat > screwed up (in a minor way), plus a few currently (at the time) > queued up changes. 8.2.3 was a big release though. > >> I would recommend that y'all consider cherry-picking, perhaps, the >> relevant commits for a possible 8.2.5 ? > > Please Cc changes which are relevant for -stable to, well, > qemu-stable@nongnu.org :) > > Which changes needs to be picked up? > Steve, can you comment here, please? At a minimum, we have this one: [PULL 20/25] migration: stop vm for cpr But that pull came with a handful of other changes that are also not in QEMU v8, so I suspect I'm missing some other important changes that might be important for a stable release? - Michael > Thanks, > > /mjt
On 5/16/2024 1:24 PM, Michael Galaxy wrote: > On 5/14/24 08:54, Michael Tokarev wrote: >> On 5/14/24 16:39, Michael Galaxy wrote: >>> Steve, >>> >>> OK, so it does not look like this bugfix you wrote was included in 8.2.4 >>> (which was released yesterday). Unfortunately, that means that anyone using >>> CPR in that release will still (eventually) encounter the bug like I did. >> >> 8.2.4 is basically a "bugfix" release for 8.2.3 which I somewhat >> screwed up (in a minor way), plus a few currently (at the time) >> queued up changes. 8.2.3 was a big release though. >> >>> I would recommend that y'all consider cherry-picking, perhaps, the relevant >>> commits for a possible 8.2.5 ? >> >> Please Cc changes which are relevant for -stable to, well, >> qemu-stable@nongnu.org :) >> >> Which changes needs to be picked up? >> > Steve, can you comment here, please? At a minimum, we have this one: [PULL > 20/25] migration: stop vm for cpr > > But that pull came with a handful of other changes that are also not in QEMU v8, > so I suspect I'm missing some other important changes that might be important > for a stable release? > > - Michael Hi Michael, I sent the full list of commits to this distribution yesterday, and I see it in my Sent email folder. Copying verbatim: ---- Michael Galaxy, I'm afraid you are out of luck with respect to qemu 8.2. It has some of the cpr reboot commits, but is missing the following: 87a2848 migration: massage cpr-reboot documentation cbdafc1 migration: options incompatible with cpr ce5db1c migration: update cpr-reboot description 9867d4d migration: stop vm for cpr 4af667f migration: notifier error checking bf78a04 migration: refactor migrate_fd_connect failures 6835f5a migration: per-mode notifiers 5663dd3 migration: MigrationNotifyFunc c763a23e migration: remove postcopy_after_devices 9d9babf migration: MigrationEvent for notifiers 3e77573 migration: convert to NotifierWithReturn d91f33c migration: remove error from notifier data be19d83 notify: pass error to notifier with return b12635f migration: fix coverity migrate_mode finding 2b58a8b tests/qtest: postcopy migration with suspend b1fdd21 tests/qtest: precopy migration with suspend 5014478 tests/qtest: option to suspend during migration f064975 tests/qtest: migration events 49a5020 migration: preserve suspended for bg_migration 58b1057 migration: preserve suspended for snapshot b4e9ddc migration: preserve suspended runstate d3c86c99 migration: propagate suspended runstate 9ff5e79 cpus: vm_resume 0f1db06 cpus: check running not RUN_STATE_RUNNING b9ae473 cpus: stop vm in suspended runstate f06f316 cpus: vm_was_suspended All of those landed in qemu 9.0. --- - Steve
OK, acknowledged. Thanks, All. - Michael On 5/16/24 13:07, Steven Sistare wrote: > On 5/16/2024 1:24 PM, Michael Galaxy wrote: >> On 5/14/24 08:54, Michael Tokarev wrote: >>> On 5/14/24 16:39, Michael Galaxy wrote: >>>> Steve, >>>> >>>> OK, so it does not look like this bugfix you wrote was included in >>>> 8.2.4 (which was released yesterday). Unfortunately, that means >>>> that anyone using CPR in that release will still (eventually) >>>> encounter the bug like I did. >>> >>> 8.2.4 is basically a "bugfix" release for 8.2.3 which I somewhat >>> screwed up (in a minor way), plus a few currently (at the time) >>> queued up changes. 8.2.3 was a big release though. >>> >>>> I would recommend that y'all consider cherry-picking, perhaps, the >>>> relevant commits for a possible 8.2.5 ? >>> >>> Please Cc changes which are relevant for -stable to, well, >>> qemu-stable@nongnu.org :) >>> >>> Which changes needs to be picked up? >>> >> Steve, can you comment here, please? At a minimum, we have this one: >> [PULL 20/25] migration: stop vm for cpr >> >> But that pull came with a handful of other changes that are also not >> in QEMU v8, so I suspect I'm missing some other important changes >> that might be important for a stable release? >> >> - Michael > > Hi Michael, I sent the full list of commits to this distribution > yesterday, and > I see it in my Sent email folder. Copying verbatim: > > ---- > Michael Galaxy, I'm afraid you are out of luck with respect to qemu 8.2. > It has some of the cpr reboot commits, but is missing the following: > > 87a2848 migration: massage cpr-reboot documentation > cbdafc1 migration: options incompatible with cpr > ce5db1c migration: update cpr-reboot description > 9867d4d migration: stop vm for cpr > 4af667f migration: notifier error checking > bf78a04 migration: refactor migrate_fd_connect failures > 6835f5a migration: per-mode notifiers > 5663dd3 migration: MigrationNotifyFunc > c763a23e migration: remove postcopy_after_devices > 9d9babf migration: MigrationEvent for notifiers > 3e77573 migration: convert to NotifierWithReturn > d91f33c migration: remove error from notifier data > be19d83 notify: pass error to notifier with return > b12635f migration: fix coverity migrate_mode finding > 2b58a8b tests/qtest: postcopy migration with suspend > b1fdd21 tests/qtest: precopy migration with suspend > 5014478 tests/qtest: option to suspend during migration > f064975 tests/qtest: migration events > 49a5020 migration: preserve suspended for bg_migration > 58b1057 migration: preserve suspended for snapshot > b4e9ddc migration: preserve suspended runstate > d3c86c99 migration: propagate suspended runstate > 9ff5e79 cpus: vm_resume > 0f1db06 cpus: check running not RUN_STATE_RUNNING > b9ae473 cpus: stop vm in suspended runstate > f06f316 cpus: vm_was_suspended > > All of those landed in qemu 9.0. > --- > > - Steve
14.05.2024 16:54, Michael Tokarev пишет: > On 5/14/24 16:39, Michael Galaxy wrote: >> Steve, >> >> OK, so it does not look like this bugfix you wrote was included in 8.2.4 (which was released yesterday). Unfortunately, that means that anyone using >> CPR in that release will still (eventually) encounter the bug like I did. > > 8.2.4 is basically a "bugfix" release for 8.2.3 which I somewhat > screwed up (in a minor way), plus a few currently (at the time) > queued up changes. 8.2.3 was a big release though. > >> I would recommend that y'all consider cherry-picking, perhaps, the relevant commits for a possible 8.2.5 ? > > Please Cc changes which are relevant for -stable to, well, > qemu-stable@nongnu.org :) > > Which changes needs to be picked up? Please also note this particular change does not apply cleanly to stable-8.2 branch due to other changes in this area between 8.2 and 9.0, in particular, in postcopy_start(). So if this is to be picked up for stable-8.2, I need help from someone who better understands this code to select changes to pick. Thanks, /mjt -- GPG Key transition (from rsa2048 to rsa4096) since 2024-04-24. New key: rsa4096/61AD3D98ECDF2C8E 9D8B E14E 3F2A 9DD7 9199 28F1 61AD 3D98 ECDF 2C8E Old key: rsa2048/457CE0A0804465C5 6EE1 95D1 886E 8FFB 810D 4324 457C E0A0 8044 65C5 Transition statement: http://www.corpit.ru/mjt/gpg-transition-2024.txt
On 5/14/2024 11:33 AM, Michael Tokarev wrote: > 14.05.2024 16:54, Michael Tokarev пишет: >> On 5/14/24 16:39, Michael Galaxy wrote: >>> Steve, >>> >>> OK, so it does not look like this bugfix you wrote was included in 8.2.4 >>> (which was released yesterday). Unfortunately, that means that anyone using >>> CPR in that release will still (eventually) encounter the bug like I did. >> >> 8.2.4 is basically a "bugfix" release for 8.2.3 which I somewhat >> screwed up (in a minor way), plus a few currently (at the time) >> queued up changes. 8.2.3 was a big release though. >> >>> I would recommend that y'all consider cherry-picking, perhaps, the relevant >>> commits for a possible 8.2.5 ? >> >> Please Cc changes which are relevant for -stable to, well, >> qemu-stable@nongnu.org :) >> >> Which changes needs to be picked up? > Please also note this particular change does not apply cleanly to > stable-8.2 branch due to other changes in this area between 8.2 > and 9.0, in particular, in postcopy_start(). So if this is to be > picked up for stable-8.2, I need help from someone who better > understands this code to select changes to pick. > > Thanks, > > /mjt Michael Galaxy, I'm afraid you are out of luck with respect to qemu 8.2. It has some of the cpr reboot commits, but is missing the following: 87a2848 migration: massage cpr-reboot documentation cbdafc1 migration: options incompatible with cpr ce5db1c migration: update cpr-reboot description 9867d4d migration: stop vm for cpr 4af667f migration: notifier error checking bf78a04 migration: refactor migrate_fd_connect failures 6835f5a migration: per-mode notifiers 5663dd3 migration: MigrationNotifyFunc c763a23e migration: remove postcopy_after_devices 9d9babf migration: MigrationEvent for notifiers 3e77573 migration: convert to NotifierWithReturn d91f33c migration: remove error from notifier data be19d83 notify: pass error to notifier with return b12635f migration: fix coverity migrate_mode finding 2b58a8b tests/qtest: postcopy migration with suspend b1fdd21 tests/qtest: precopy migration with suspend 5014478 tests/qtest: option to suspend during migration f064975 tests/qtest: migration events 49a5020 migration: preserve suspended for bg_migration 58b1057 migration: preserve suspended for snapshot b4e9ddc migration: preserve suspended runstate d3c86c99 migration: propagate suspended runstate 9ff5e79 cpus: vm_resume 0f1db06 cpus: check running not RUN_STATE_RUNNING b9ae473 cpus: stop vm in suspended runstate f06f316 cpus: vm_was_suspended All of those landed in qemu 9.0. - Steve
© 2016 - 2024 Red Hat, Inc.