When migration for cpr is initiated, stop the vm and set state
RUN_STATE_FINISH_MIGRATE before ram is saved. This eliminates the
possibility of ram and device state being out of sync, and guarantees
that a guest in the suspended state remains suspended, because qmp_cont
rejects a cont command in the RUN_STATE_FINISH_MIGRATE state.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
---
include/migration/misc.h | 1 +
migration/migration.c | 32 +++++++++++++++++++++++++-------
2 files changed, 26 insertions(+), 7 deletions(-)
diff --git a/include/migration/misc.h b/include/migration/misc.h
index 6dc234b..54c99a3 100644
--- a/include/migration/misc.h
+++ b/include/migration/misc.h
@@ -60,6 +60,7 @@ void migration_object_init(void);
void migration_shutdown(void);
bool migration_is_idle(void);
bool migration_is_active(MigrationState *);
+bool migrate_mode_is_cpr(MigrationState *);
typedef enum MigrationEventType {
MIG_EVENT_PRECOPY_SETUP,
diff --git a/migration/migration.c b/migration/migration.c
index d1fce9e..fc5c587 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1603,6 +1603,11 @@ bool migration_is_active(MigrationState *s)
s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
}
+bool migrate_mode_is_cpr(MigrationState *s)
+{
+ return s->parameters.mode == MIG_MODE_CPR_REBOOT;
+}
+
int migrate_init(MigrationState *s, Error **errp)
{
int ret;
@@ -2651,13 +2656,14 @@ static int migration_completion_precopy(MigrationState *s,
bql_lock();
migration_downtime_start(s);
- s->vm_old_state = runstate_get();
- global_state_store();
-
- ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
- trace_migration_completion_vm_stop(ret);
- if (ret < 0) {
- goto out_unlock;
+ if (!migrate_mode_is_cpr(s)) {
+ s->vm_old_state = runstate_get();
+ global_state_store();
+ ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
+ trace_migration_completion_vm_stop(ret);
+ if (ret < 0) {
+ goto out_unlock;
+ }
}
ret = migration_maybe_pause(s, current_active_state,
@@ -3576,6 +3582,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
Error *local_err = NULL;
uint64_t rate_limit;
bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED;
+ int ret;
/*
* If there's a previous error, free it and prepare for another one.
@@ -3651,6 +3658,17 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
goto fail;
}
+ if (migrate_mode_is_cpr(s)) {
+ s->vm_old_state = runstate_get();
+ global_state_store();
+ ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
+ trace_migration_completion_vm_stop(ret);
+ if (ret < 0) {
+ error_setg(&local_err, "migration_stop_vm failed, error %d", -ret);
+ goto fail;
+ }
+ }
+
if (migrate_background_snapshot()) {
qemu_thread_create(&s->thread, "bg_snapshot",
bg_migration_thread, s, QEMU_THREAD_JOINABLE);
--
1.8.3.1
On Thu, Feb 08, 2024 at 10:54:03AM -0800, Steve Sistare wrote:
> When migration for cpr is initiated, stop the vm and set state
> RUN_STATE_FINISH_MIGRATE before ram is saved. This eliminates the
> possibility of ram and device state being out of sync, and guarantees
> that a guest in the suspended state remains suspended, because qmp_cont
> rejects a cont command in the RUN_STATE_FINISH_MIGRATE state.
>
> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
> ---
> include/migration/misc.h | 1 +
> migration/migration.c | 32 +++++++++++++++++++++++++-------
> 2 files changed, 26 insertions(+), 7 deletions(-)
>
> diff --git a/include/migration/misc.h b/include/migration/misc.h
> index 6dc234b..54c99a3 100644
> --- a/include/migration/misc.h
> +++ b/include/migration/misc.h
> @@ -60,6 +60,7 @@ void migration_object_init(void);
> void migration_shutdown(void);
> bool migration_is_idle(void);
> bool migration_is_active(MigrationState *);
> +bool migrate_mode_is_cpr(MigrationState *);
>
> typedef enum MigrationEventType {
> MIG_EVENT_PRECOPY_SETUP,
> diff --git a/migration/migration.c b/migration/migration.c
> index d1fce9e..fc5c587 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -1603,6 +1603,11 @@ bool migration_is_active(MigrationState *s)
> s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
> }
>
> +bool migrate_mode_is_cpr(MigrationState *s)
> +{
> + return s->parameters.mode == MIG_MODE_CPR_REBOOT;
> +}
> +
> int migrate_init(MigrationState *s, Error **errp)
> {
> int ret;
> @@ -2651,13 +2656,14 @@ static int migration_completion_precopy(MigrationState *s,
> bql_lock();
> migration_downtime_start(s);
>
> - s->vm_old_state = runstate_get();
> - global_state_store();
> -
> - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
> - trace_migration_completion_vm_stop(ret);
> - if (ret < 0) {
> - goto out_unlock;
> + if (!migrate_mode_is_cpr(s)) {
> + s->vm_old_state = runstate_get();
> + global_state_store();
> + ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
> + trace_migration_completion_vm_stop(ret);
> + if (ret < 0) {
> + goto out_unlock;
> + }
> }
>
> ret = migration_maybe_pause(s, current_active_state,
> @@ -3576,6 +3582,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
> Error *local_err = NULL;
> uint64_t rate_limit;
> bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED;
> + int ret;
>
> /*
> * If there's a previous error, free it and prepare for another one.
> @@ -3651,6 +3658,17 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
> goto fail;
> }
>
> + if (migrate_mode_is_cpr(s)) {
> + s->vm_old_state = runstate_get();
> + global_state_store();
> + ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
> + trace_migration_completion_vm_stop(ret);
> + if (ret < 0) {
> + error_setg(&local_err, "migration_stop_vm failed, error %d", -ret);
> + goto fail;
> + }
> + }
Could we have a helper function for the shared codes?
How about postcopy? I know it's nonsense to enable postcopy for cpr.. but
iiuc we don't yet forbid an user doing so. Maybe we should?
> +
> if (migrate_background_snapshot()) {
> qemu_thread_create(&s->thread, "bg_snapshot",
> bg_migration_thread, s, QEMU_THREAD_JOINABLE);
> --
> 1.8.3.1
>
--
Peter Xu
On 2/20/2024 2:33 AM, Peter Xu wrote:
> On Thu, Feb 08, 2024 at 10:54:03AM -0800, Steve Sistare wrote:
>> When migration for cpr is initiated, stop the vm and set state
>> RUN_STATE_FINISH_MIGRATE before ram is saved. This eliminates the
>> possibility of ram and device state being out of sync, and guarantees
>> that a guest in the suspended state remains suspended, because qmp_cont
>> rejects a cont command in the RUN_STATE_FINISH_MIGRATE state.
>>
>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>> ---
>> include/migration/misc.h | 1 +
>> migration/migration.c | 32 +++++++++++++++++++++++++-------
>> 2 files changed, 26 insertions(+), 7 deletions(-)
>>
>> diff --git a/include/migration/misc.h b/include/migration/misc.h
>> index 6dc234b..54c99a3 100644
>> --- a/include/migration/misc.h
>> +++ b/include/migration/misc.h
>> @@ -60,6 +60,7 @@ void migration_object_init(void);
>> void migration_shutdown(void);
>> bool migration_is_idle(void);
>> bool migration_is_active(MigrationState *);
>> +bool migrate_mode_is_cpr(MigrationState *);
>>
>> typedef enum MigrationEventType {
>> MIG_EVENT_PRECOPY_SETUP,
>> diff --git a/migration/migration.c b/migration/migration.c
>> index d1fce9e..fc5c587 100644
>> --- a/migration/migration.c
>> +++ b/migration/migration.c
>> @@ -1603,6 +1603,11 @@ bool migration_is_active(MigrationState *s)
>> s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
>> }
>>
>> +bool migrate_mode_is_cpr(MigrationState *s)
>> +{
>> + return s->parameters.mode == MIG_MODE_CPR_REBOOT;
>> +}
>> +
>> int migrate_init(MigrationState *s, Error **errp)
>> {
>> int ret;
>> @@ -2651,13 +2656,14 @@ static int migration_completion_precopy(MigrationState *s,
>> bql_lock();
>> migration_downtime_start(s);
>>
>> - s->vm_old_state = runstate_get();
>> - global_state_store();
>> -
>> - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
>> - trace_migration_completion_vm_stop(ret);
>> - if (ret < 0) {
>> - goto out_unlock;
>> + if (!migrate_mode_is_cpr(s)) {
>> + s->vm_old_state = runstate_get();
>> + global_state_store();
>> + ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
>> + trace_migration_completion_vm_stop(ret);
>> + if (ret < 0) {
>> + goto out_unlock;
>> + }
>> }
>>
>> ret = migration_maybe_pause(s, current_active_state,
>> @@ -3576,6 +3582,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
>> Error *local_err = NULL;
>> uint64_t rate_limit;
>> bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED;
>> + int ret;
>>
>> /*
>> * If there's a previous error, free it and prepare for another one.
>> @@ -3651,6 +3658,17 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
>> goto fail;
>> }
>>
>> + if (migrate_mode_is_cpr(s)) {
>> + s->vm_old_state = runstate_get();
>> + global_state_store();
>> + ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
>> + trace_migration_completion_vm_stop(ret);
>> + if (ret < 0) {
>> + error_setg(&local_err, "migration_stop_vm failed, error %d", -ret);
>> + goto fail;
>> + }
>> + }
>
> Could we have a helper function for the shared codes?
I propose to add code to migration_stop_vm to make it the helper. Some call sites emit
more traces (via migration_stop_vm) as a result of my refactoring, and postcopy start sets
vm_old_state, which is not used thereafter in that path. Those changes seem harmless to me.
Tell me what you think:
-------------------------------------------------------
diff --git a/migration/migration.c b/migration/migration.c
index fc5c587..30d2b08 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -107,12 +107,6 @@ static int migration_maybe_pause(MigrationState *s,
static void migrate_fd_cancel(MigrationState *s);
static bool close_return_path_on_source(MigrationState *s);
-static void migration_downtime_start(MigrationState *s)
-{
- trace_vmstate_downtime_checkpoint("src-downtime-start");
- s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
-}
-
static void migration_downtime_end(MigrationState *s)
{
int64_t now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
@@ -161,11 +155,20 @@ static gint page_request_addr_cmp(gconstpointer ap, gconstpo
return (a > b) - (a < b);
}
-int migration_stop_vm(RunState state)
+static int migration_stop_vm(MigrationState *s, RunState state)
{
- int ret = vm_stop_force_state(state);
+ int ret;
+
+ trace_vmstate_downtime_checkpoint("src-downtime-start");
+ s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+
+ s->vm_old_state = runstate_get();
+ global_state_store();
+
+ ret = vm_stop_force_state(state);
trace_vmstate_downtime_checkpoint("src-vm-stopped");
+ trace_migration_completion_vm_stop(ret);
return ret;
}
@@ -2454,10 +2457,7 @@ static int postcopy_start(MigrationState *ms, Error **errp)
bql_lock();
trace_postcopy_start_set_run();
- migration_downtime_start(ms);
-
- global_state_store();
- ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
+ ret = migration_stop_vm(ms, RUN_STATE_FINISH_MIGRATE);
if (ret < 0) {
goto fail;
}
@@ -2654,13 +2654,9 @@ static int migration_completion_precopy(MigrationState *s,
int ret;
bql_lock();
- migration_downtime_start(s);
if (!migrate_mode_is_cpr(s)) {
- s->vm_old_state = runstate_get();
- global_state_store();
- ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
- trace_migration_completion_vm_stop(ret);
+ ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE);
if (ret < 0) {
goto out_unlock;
}
@@ -3498,15 +3494,10 @@ static void *bg_migration_thread(void *opaque)
s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
trace_migration_thread_setup_complete();
- migration_downtime_start(s);
bql_lock();
- s->vm_old_state = runstate_get();
-
- global_state_store();
- /* Forcibly stop VM before saving state of vCPUs and devices */
- if (migration_stop_vm(RUN_STATE_PAUSED)) {
+ if (migration_stop_vm(s, RUN_STATE_PAUSED)) {
goto fail;
}
/*
@@ -3659,10 +3650,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
}
if (migrate_mode_is_cpr(s)) {
- s->vm_old_state = runstate_get();
- global_state_store();
- ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
- trace_migration_completion_vm_stop(ret);
+ ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE);
if (ret < 0) {
error_setg(&local_err, "migration_stop_vm failed, error %d", -ret);
goto fail;
diff --git a/migration/migration.h b/migration/migration.h
index aef8afb..65c0b61 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -541,6 +541,4 @@ int migration_rp_wait(MigrationState *s);
*/
void migration_rp_kick(MigrationState *s);
-int migration_stop_vm(RunState state);
-
#endif
-------------------------------------------------------
- Steve
On Wed, Feb 21, 2024 at 04:20:07PM -0500, Steven Sistare wrote:
> On 2/20/2024 2:33 AM, Peter Xu wrote:
> > On Thu, Feb 08, 2024 at 10:54:03AM -0800, Steve Sistare wrote:
> >> When migration for cpr is initiated, stop the vm and set state
> >> RUN_STATE_FINISH_MIGRATE before ram is saved. This eliminates the
> >> possibility of ram and device state being out of sync, and guarantees
> >> that a guest in the suspended state remains suspended, because qmp_cont
> >> rejects a cont command in the RUN_STATE_FINISH_MIGRATE state.
> >>
> >> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
> >> ---
> >> include/migration/misc.h | 1 +
> >> migration/migration.c | 32 +++++++++++++++++++++++++-------
> >> 2 files changed, 26 insertions(+), 7 deletions(-)
> >>
> >> diff --git a/include/migration/misc.h b/include/migration/misc.h
> >> index 6dc234b..54c99a3 100644
> >> --- a/include/migration/misc.h
> >> +++ b/include/migration/misc.h
> >> @@ -60,6 +60,7 @@ void migration_object_init(void);
> >> void migration_shutdown(void);
> >> bool migration_is_idle(void);
> >> bool migration_is_active(MigrationState *);
> >> +bool migrate_mode_is_cpr(MigrationState *);
> >>
> >> typedef enum MigrationEventType {
> >> MIG_EVENT_PRECOPY_SETUP,
> >> diff --git a/migration/migration.c b/migration/migration.c
> >> index d1fce9e..fc5c587 100644
> >> --- a/migration/migration.c
> >> +++ b/migration/migration.c
> >> @@ -1603,6 +1603,11 @@ bool migration_is_active(MigrationState *s)
> >> s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
> >> }
> >>
> >> +bool migrate_mode_is_cpr(MigrationState *s)
> >> +{
> >> + return s->parameters.mode == MIG_MODE_CPR_REBOOT;
> >> +}
> >> +
> >> int migrate_init(MigrationState *s, Error **errp)
> >> {
> >> int ret;
> >> @@ -2651,13 +2656,14 @@ static int migration_completion_precopy(MigrationState *s,
> >> bql_lock();
> >> migration_downtime_start(s);
> >>
> >> - s->vm_old_state = runstate_get();
> >> - global_state_store();
> >> -
> >> - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
> >> - trace_migration_completion_vm_stop(ret);
> >> - if (ret < 0) {
> >> - goto out_unlock;
> >> + if (!migrate_mode_is_cpr(s)) {
> >> + s->vm_old_state = runstate_get();
> >> + global_state_store();
> >> + ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
> >> + trace_migration_completion_vm_stop(ret);
> >> + if (ret < 0) {
> >> + goto out_unlock;
> >> + }
> >> }
> >>
> >> ret = migration_maybe_pause(s, current_active_state,
> >> @@ -3576,6 +3582,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
> >> Error *local_err = NULL;
> >> uint64_t rate_limit;
> >> bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED;
> >> + int ret;
> >>
> >> /*
> >> * If there's a previous error, free it and prepare for another one.
> >> @@ -3651,6 +3658,17 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
> >> goto fail;
> >> }
> >>
> >> + if (migrate_mode_is_cpr(s)) {
> >> + s->vm_old_state = runstate_get();
> >> + global_state_store();
> >> + ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
> >> + trace_migration_completion_vm_stop(ret);
> >> + if (ret < 0) {
> >> + error_setg(&local_err, "migration_stop_vm failed, error %d", -ret);
> >> + goto fail;
> >> + }
> >> + }
> >
> > Could we have a helper function for the shared codes?
>
> I propose to add code to migration_stop_vm to make it the helper. Some call sites emit
> more traces (via migration_stop_vm) as a result of my refactoring,
This should be fine.
> and postcopy start sets
> vm_old_state, which is not used thereafter in that path. Those changes seem harmless to me.
Not only harmless, I think it was a bug to not set vm_old_state in
postcopy_start().. See:
https://issues.redhat.com/browse/RHEL-18061
I suspect that was the cause.
> Tell me what you think:
I'll have a closer look later, but so far it looks all good.
Thanks,
>
> -------------------------------------------------------
> diff --git a/migration/migration.c b/migration/migration.c
> index fc5c587..30d2b08 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -107,12 +107,6 @@ static int migration_maybe_pause(MigrationState *s,
> static void migrate_fd_cancel(MigrationState *s);
> static bool close_return_path_on_source(MigrationState *s);
>
> -static void migration_downtime_start(MigrationState *s)
> -{
> - trace_vmstate_downtime_checkpoint("src-downtime-start");
> - s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> -}
> -
> static void migration_downtime_end(MigrationState *s)
> {
> int64_t now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> @@ -161,11 +155,20 @@ static gint page_request_addr_cmp(gconstpointer ap, gconstpo
> return (a > b) - (a < b);
> }
>
> -int migration_stop_vm(RunState state)
> +static int migration_stop_vm(MigrationState *s, RunState state)
> {
> - int ret = vm_stop_force_state(state);
> + int ret;
> +
> + trace_vmstate_downtime_checkpoint("src-downtime-start");
> + s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> +
> + s->vm_old_state = runstate_get();
> + global_state_store();
> +
> + ret = vm_stop_force_state(state);
>
> trace_vmstate_downtime_checkpoint("src-vm-stopped");
> + trace_migration_completion_vm_stop(ret);
>
> return ret;
> }
> @@ -2454,10 +2457,7 @@ static int postcopy_start(MigrationState *ms, Error **errp)
> bql_lock();
> trace_postcopy_start_set_run();
>
> - migration_downtime_start(ms);
> -
> - global_state_store();
> - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
> + ret = migration_stop_vm(ms, RUN_STATE_FINISH_MIGRATE);
> if (ret < 0) {
> goto fail;
> }
> @@ -2654,13 +2654,9 @@ static int migration_completion_precopy(MigrationState *s,
> int ret;
>
> bql_lock();
> - migration_downtime_start(s);
>
> if (!migrate_mode_is_cpr(s)) {
> - s->vm_old_state = runstate_get();
> - global_state_store();
> - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
> - trace_migration_completion_vm_stop(ret);
> + ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE);
> if (ret < 0) {
> goto out_unlock;
> }
> @@ -3498,15 +3494,10 @@ static void *bg_migration_thread(void *opaque)
> s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
>
> trace_migration_thread_setup_complete();
> - migration_downtime_start(s);
>
> bql_lock();
>
> - s->vm_old_state = runstate_get();
> -
> - global_state_store();
> - /* Forcibly stop VM before saving state of vCPUs and devices */
> - if (migration_stop_vm(RUN_STATE_PAUSED)) {
> + if (migration_stop_vm(s, RUN_STATE_PAUSED)) {
> goto fail;
> }
> /*
> @@ -3659,10 +3650,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
> }
>
> if (migrate_mode_is_cpr(s)) {
> - s->vm_old_state = runstate_get();
> - global_state_store();
> - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
> - trace_migration_completion_vm_stop(ret);
> + ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE);
> if (ret < 0) {
> error_setg(&local_err, "migration_stop_vm failed, error %d", -ret);
> goto fail;
> diff --git a/migration/migration.h b/migration/migration.h
> index aef8afb..65c0b61 100644
> --- a/migration/migration.h
> +++ b/migration/migration.h
> @@ -541,6 +541,4 @@ int migration_rp_wait(MigrationState *s);
> */
> void migration_rp_kick(MigrationState *s);
>
> -int migration_stop_vm(RunState state);
> -
> #endif
> -------------------------------------------------------
>
> - Steve
>
--
Peter Xu
On Thu, Feb 22, 2024 at 05:12:53PM +0800, Peter Xu wrote:
> On Wed, Feb 21, 2024 at 04:20:07PM -0500, Steven Sistare wrote:
> > On 2/20/2024 2:33 AM, Peter Xu wrote:
> > > On Thu, Feb 08, 2024 at 10:54:03AM -0800, Steve Sistare wrote:
> > >> When migration for cpr is initiated, stop the vm and set state
> > >> RUN_STATE_FINISH_MIGRATE before ram is saved. This eliminates the
> > >> possibility of ram and device state being out of sync, and guarantees
> > >> that a guest in the suspended state remains suspended, because qmp_cont
> > >> rejects a cont command in the RUN_STATE_FINISH_MIGRATE state.
> > >>
> > >> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
> > >> ---
> > >> include/migration/misc.h | 1 +
> > >> migration/migration.c | 32 +++++++++++++++++++++++++-------
> > >> 2 files changed, 26 insertions(+), 7 deletions(-)
> > >>
> > >> diff --git a/include/migration/misc.h b/include/migration/misc.h
> > >> index 6dc234b..54c99a3 100644
> > >> --- a/include/migration/misc.h
> > >> +++ b/include/migration/misc.h
> > >> @@ -60,6 +60,7 @@ void migration_object_init(void);
> > >> void migration_shutdown(void);
> > >> bool migration_is_idle(void);
> > >> bool migration_is_active(MigrationState *);
> > >> +bool migrate_mode_is_cpr(MigrationState *);
> > >>
> > >> typedef enum MigrationEventType {
> > >> MIG_EVENT_PRECOPY_SETUP,
> > >> diff --git a/migration/migration.c b/migration/migration.c
> > >> index d1fce9e..fc5c587 100644
> > >> --- a/migration/migration.c
> > >> +++ b/migration/migration.c
> > >> @@ -1603,6 +1603,11 @@ bool migration_is_active(MigrationState *s)
> > >> s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
> > >> }
> > >>
> > >> +bool migrate_mode_is_cpr(MigrationState *s)
> > >> +{
> > >> + return s->parameters.mode == MIG_MODE_CPR_REBOOT;
> > >> +}
> > >> +
> > >> int migrate_init(MigrationState *s, Error **errp)
> > >> {
> > >> int ret;
> > >> @@ -2651,13 +2656,14 @@ static int migration_completion_precopy(MigrationState *s,
> > >> bql_lock();
> > >> migration_downtime_start(s);
> > >>
> > >> - s->vm_old_state = runstate_get();
> > >> - global_state_store();
> > >> -
> > >> - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
> > >> - trace_migration_completion_vm_stop(ret);
> > >> - if (ret < 0) {
> > >> - goto out_unlock;
> > >> + if (!migrate_mode_is_cpr(s)) {
> > >> + s->vm_old_state = runstate_get();
> > >> + global_state_store();
> > >> + ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
> > >> + trace_migration_completion_vm_stop(ret);
> > >> + if (ret < 0) {
> > >> + goto out_unlock;
> > >> + }
> > >> }
> > >>
> > >> ret = migration_maybe_pause(s, current_active_state,
> > >> @@ -3576,6 +3582,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
> > >> Error *local_err = NULL;
> > >> uint64_t rate_limit;
> > >> bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED;
> > >> + int ret;
> > >>
> > >> /*
> > >> * If there's a previous error, free it and prepare for another one.
> > >> @@ -3651,6 +3658,17 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
> > >> goto fail;
> > >> }
> > >>
> > >> + if (migrate_mode_is_cpr(s)) {
> > >> + s->vm_old_state = runstate_get();
> > >> + global_state_store();
> > >> + ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
> > >> + trace_migration_completion_vm_stop(ret);
> > >> + if (ret < 0) {
> > >> + error_setg(&local_err, "migration_stop_vm failed, error %d", -ret);
> > >> + goto fail;
> > >> + }
> > >> + }
> > >
> > > Could we have a helper function for the shared codes?
> >
> > I propose to add code to migration_stop_vm to make it the helper. Some call sites emit
> > more traces (via migration_stop_vm) as a result of my refactoring,
>
> This should be fine.
>
> > and postcopy start sets
> > vm_old_state, which is not used thereafter in that path. Those changes seem harmless to me.
>
> Not only harmless, I think it was a bug to not set vm_old_state in
> postcopy_start().. See:
>
> https://issues.redhat.com/browse/RHEL-18061
>
> I suspect that was the cause.
>
> > Tell me what you think:
>
> I'll have a closer look later, but so far it looks all good.
>
> Thanks,
>
> >
> > -------------------------------------------------------
> > diff --git a/migration/migration.c b/migration/migration.c
> > index fc5c587..30d2b08 100644
> > --- a/migration/migration.c
> > +++ b/migration/migration.c
> > @@ -107,12 +107,6 @@ static int migration_maybe_pause(MigrationState *s,
> > static void migrate_fd_cancel(MigrationState *s);
> > static bool close_return_path_on_source(MigrationState *s);
> >
> > -static void migration_downtime_start(MigrationState *s)
> > -{
> > - trace_vmstate_downtime_checkpoint("src-downtime-start");
> > - s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> > -}
Ah.. one more thing: would you mind keep this helper even if it can be
squashed when sending formal patch? I want to keep downtime start/end
super clear and paired as they're important hook points. It should be
inlined anyway by the compiler.
> > -
> > static void migration_downtime_end(MigrationState *s)
> > {
> > int64_t now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> > @@ -161,11 +155,20 @@ static gint page_request_addr_cmp(gconstpointer ap, gconstpo
> > return (a > b) - (a < b);
> > }
> >
> > -int migration_stop_vm(RunState state)
> > +static int migration_stop_vm(MigrationState *s, RunState state)
> > {
> > - int ret = vm_stop_force_state(state);
> > + int ret;
> > +
> > + trace_vmstate_downtime_checkpoint("src-downtime-start");
> > + s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> > +
> > + s->vm_old_state = runstate_get();
> > + global_state_store();
> > +
> > + ret = vm_stop_force_state(state);
> >
> > trace_vmstate_downtime_checkpoint("src-vm-stopped");
> > + trace_migration_completion_vm_stop(ret);
> >
> > return ret;
> > }
> > @@ -2454,10 +2457,7 @@ static int postcopy_start(MigrationState *ms, Error **errp)
> > bql_lock();
> > trace_postcopy_start_set_run();
> >
> > - migration_downtime_start(ms);
> > -
> > - global_state_store();
> > - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
> > + ret = migration_stop_vm(ms, RUN_STATE_FINISH_MIGRATE);
> > if (ret < 0) {
> > goto fail;
> > }
> > @@ -2654,13 +2654,9 @@ static int migration_completion_precopy(MigrationState *s,
> > int ret;
> >
> > bql_lock();
> > - migration_downtime_start(s);
> >
> > if (!migrate_mode_is_cpr(s)) {
> > - s->vm_old_state = runstate_get();
> > - global_state_store();
> > - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
> > - trace_migration_completion_vm_stop(ret);
> > + ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE);
> > if (ret < 0) {
> > goto out_unlock;
> > }
> > @@ -3498,15 +3494,10 @@ static void *bg_migration_thread(void *opaque)
> > s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
> >
> > trace_migration_thread_setup_complete();
> > - migration_downtime_start(s);
> >
> > bql_lock();
> >
> > - s->vm_old_state = runstate_get();
> > -
> > - global_state_store();
> > - /* Forcibly stop VM before saving state of vCPUs and devices */
> > - if (migration_stop_vm(RUN_STATE_PAUSED)) {
> > + if (migration_stop_vm(s, RUN_STATE_PAUSED)) {
> > goto fail;
> > }
> > /*
> > @@ -3659,10 +3650,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
> > }
> >
> > if (migrate_mode_is_cpr(s)) {
> > - s->vm_old_state = runstate_get();
> > - global_state_store();
> > - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
> > - trace_migration_completion_vm_stop(ret);
> > + ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE);
> > if (ret < 0) {
> > error_setg(&local_err, "migration_stop_vm failed, error %d", -ret);
> > goto fail;
> > diff --git a/migration/migration.h b/migration/migration.h
> > index aef8afb..65c0b61 100644
> > --- a/migration/migration.h
> > +++ b/migration/migration.h
> > @@ -541,6 +541,4 @@ int migration_rp_wait(MigrationState *s);
> > */
> > void migration_rp_kick(MigrationState *s);
> >
> > -int migration_stop_vm(RunState state);
> > -
> > #endif
> > -------------------------------------------------------
> >
> > - Steve
> >
>
> --
> Peter Xu
--
Peter Xu
On 2/22/2024 4:30 AM, Peter Xu wrote:
> On Thu, Feb 22, 2024 at 05:12:53PM +0800, Peter Xu wrote:
>> On Wed, Feb 21, 2024 at 04:20:07PM -0500, Steven Sistare wrote:
>>> On 2/20/2024 2:33 AM, Peter Xu wrote:
>>>> On Thu, Feb 08, 2024 at 10:54:03AM -0800, Steve Sistare wrote:
>>>>> When migration for cpr is initiated, stop the vm and set state
>>>>> RUN_STATE_FINISH_MIGRATE before ram is saved. This eliminates the
>>>>> possibility of ram and device state being out of sync, and guarantees
>>>>> that a guest in the suspended state remains suspended, because qmp_cont
>>>>> rejects a cont command in the RUN_STATE_FINISH_MIGRATE state.
>>>>>
>>>>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>>>>> ---
>>>>> include/migration/misc.h | 1 +
>>>>> migration/migration.c | 32 +++++++++++++++++++++++++-------
>>>>> 2 files changed, 26 insertions(+), 7 deletions(-)
>>>>>
>>>>> diff --git a/include/migration/misc.h b/include/migration/misc.h
>>>>> index 6dc234b..54c99a3 100644
>>>>> --- a/include/migration/misc.h
>>>>> +++ b/include/migration/misc.h
>>>>> @@ -60,6 +60,7 @@ void migration_object_init(void);
>>>>> void migration_shutdown(void);
>>>>> bool migration_is_idle(void);
>>>>> bool migration_is_active(MigrationState *);
>>>>> +bool migrate_mode_is_cpr(MigrationState *);
>>>>>
>>>>> typedef enum MigrationEventType {
>>>>> MIG_EVENT_PRECOPY_SETUP,
>>>>> diff --git a/migration/migration.c b/migration/migration.c
>>>>> index d1fce9e..fc5c587 100644
>>>>> --- a/migration/migration.c
>>>>> +++ b/migration/migration.c
>>>>> @@ -1603,6 +1603,11 @@ bool migration_is_active(MigrationState *s)
>>>>> s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
>>>>> }
>>>>>
>>>>> +bool migrate_mode_is_cpr(MigrationState *s)
>>>>> +{
>>>>> + return s->parameters.mode == MIG_MODE_CPR_REBOOT;
>>>>> +}
>>>>> +
>>>>> int migrate_init(MigrationState *s, Error **errp)
>>>>> {
>>>>> int ret;
>>>>> @@ -2651,13 +2656,14 @@ static int migration_completion_precopy(MigrationState *s,
>>>>> bql_lock();
>>>>> migration_downtime_start(s);
>>>>>
>>>>> - s->vm_old_state = runstate_get();
>>>>> - global_state_store();
>>>>> -
>>>>> - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
>>>>> - trace_migration_completion_vm_stop(ret);
>>>>> - if (ret < 0) {
>>>>> - goto out_unlock;
>>>>> + if (!migrate_mode_is_cpr(s)) {
>>>>> + s->vm_old_state = runstate_get();
>>>>> + global_state_store();
>>>>> + ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
>>>>> + trace_migration_completion_vm_stop(ret);
>>>>> + if (ret < 0) {
>>>>> + goto out_unlock;
>>>>> + }
>>>>> }
>>>>>
>>>>> ret = migration_maybe_pause(s, current_active_state,
>>>>> @@ -3576,6 +3582,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
>>>>> Error *local_err = NULL;
>>>>> uint64_t rate_limit;
>>>>> bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED;
>>>>> + int ret;
>>>>>
>>>>> /*
>>>>> * If there's a previous error, free it and prepare for another one.
>>>>> @@ -3651,6 +3658,17 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
>>>>> goto fail;
>>>>> }
>>>>>
>>>>> + if (migrate_mode_is_cpr(s)) {
>>>>> + s->vm_old_state = runstate_get();
>>>>> + global_state_store();
>>>>> + ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
>>>>> + trace_migration_completion_vm_stop(ret);
>>>>> + if (ret < 0) {
>>>>> + error_setg(&local_err, "migration_stop_vm failed, error %d", -ret);
>>>>> + goto fail;
>>>>> + }
>>>>> + }
>>>>
>>>> Could we have a helper function for the shared codes?
>>>
>>> I propose to add code to migration_stop_vm to make it the helper. Some call sites emit
>>> more traces (via migration_stop_vm) as a result of my refactoring,
>>
>> This should be fine.
>>
>>> and postcopy start sets
>>> vm_old_state, which is not used thereafter in that path. Those changes seem harmless to me.
>>
>> Not only harmless, I think it was a bug to not set vm_old_state in
>> postcopy_start().. See:
>>
>> https://issues.redhat.com/browse/RHEL-18061
>>
>> I suspect that was the cause.
>>
>>> Tell me what you think:
>>
>> I'll have a closer look later, but so far it looks all good.
>>
>> Thanks,
>>
>>>
>>> -------------------------------------------------------
>>> diff --git a/migration/migration.c b/migration/migration.c
>>> index fc5c587..30d2b08 100644
>>> --- a/migration/migration.c
>>> +++ b/migration/migration.c
>>> @@ -107,12 +107,6 @@ static int migration_maybe_pause(MigrationState *s,
>>> static void migrate_fd_cancel(MigrationState *s);
>>> static bool close_return_path_on_source(MigrationState *s);
>>>
>>> -static void migration_downtime_start(MigrationState *s)
>>> -{
>>> - trace_vmstate_downtime_checkpoint("src-downtime-start");
>>> - s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
>>> -}
>
> Ah.. one more thing: would you mind keep this helper even if it can be
> squashed when sending formal patch? I want to keep downtime start/end
> super clear and paired as they're important hook points. It should be
> inlined anyway by the compiler.
Will do - steve
>>> -
>>> static void migration_downtime_end(MigrationState *s)
>>> {
>>> int64_t now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
>>> @@ -161,11 +155,20 @@ static gint page_request_addr_cmp(gconstpointer ap, gconstpo
>>> return (a > b) - (a < b);
>>> }
>>>
>>> -int migration_stop_vm(RunState state)
>>> +static int migration_stop_vm(MigrationState *s, RunState state)
>>> {
>>> - int ret = vm_stop_force_state(state);
>>> + int ret;
>>> +
>>> + trace_vmstate_downtime_checkpoint("src-downtime-start");
>>> + s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
>>> +
>>> + s->vm_old_state = runstate_get();
>>> + global_state_store();
>>> +
>>> + ret = vm_stop_force_state(state);
>>>
>>> trace_vmstate_downtime_checkpoint("src-vm-stopped");
>>> + trace_migration_completion_vm_stop(ret);
>>>
>>> return ret;
>>> }
>>> @@ -2454,10 +2457,7 @@ static int postcopy_start(MigrationState *ms, Error **errp)
>>> bql_lock();
>>> trace_postcopy_start_set_run();
>>>
>>> - migration_downtime_start(ms);
>>> -
>>> - global_state_store();
>>> - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
>>> + ret = migration_stop_vm(ms, RUN_STATE_FINISH_MIGRATE);
>>> if (ret < 0) {
>>> goto fail;
>>> }
>>> @@ -2654,13 +2654,9 @@ static int migration_completion_precopy(MigrationState *s,
>>> int ret;
>>>
>>> bql_lock();
>>> - migration_downtime_start(s);
>>>
>>> if (!migrate_mode_is_cpr(s)) {
>>> - s->vm_old_state = runstate_get();
>>> - global_state_store();
>>> - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
>>> - trace_migration_completion_vm_stop(ret);
>>> + ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE);
>>> if (ret < 0) {
>>> goto out_unlock;
>>> }
>>> @@ -3498,15 +3494,10 @@ static void *bg_migration_thread(void *opaque)
>>> s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
>>>
>>> trace_migration_thread_setup_complete();
>>> - migration_downtime_start(s);
>>>
>>> bql_lock();
>>>
>>> - s->vm_old_state = runstate_get();
>>> -
>>> - global_state_store();
>>> - /* Forcibly stop VM before saving state of vCPUs and devices */
>>> - if (migration_stop_vm(RUN_STATE_PAUSED)) {
>>> + if (migration_stop_vm(s, RUN_STATE_PAUSED)) {
>>> goto fail;
>>> }
>>> /*
>>> @@ -3659,10 +3650,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
>>> }
>>>
>>> if (migrate_mode_is_cpr(s)) {
>>> - s->vm_old_state = runstate_get();
>>> - global_state_store();
>>> - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
>>> - trace_migration_completion_vm_stop(ret);
>>> + ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE);
>>> if (ret < 0) {
>>> error_setg(&local_err, "migration_stop_vm failed, error %d", -ret);
>>> goto fail;
>>> diff --git a/migration/migration.h b/migration/migration.h
>>> index aef8afb..65c0b61 100644
>>> --- a/migration/migration.h
>>> +++ b/migration/migration.h
>>> @@ -541,6 +541,4 @@ int migration_rp_wait(MigrationState *s);
>>> */
>>> void migration_rp_kick(MigrationState *s);
>>>
>>> -int migration_stop_vm(RunState state);
>>> -
>>> #endif
>>> -------------------------------------------------------
>>>
>>> - Steve
>>>
>>
>> --
>> Peter Xu
>
On 2/20/2024 2:33 AM, Peter Xu wrote:
> On Thu, Feb 08, 2024 at 10:54:03AM -0800, Steve Sistare wrote:
>> When migration for cpr is initiated, stop the vm and set state
>> RUN_STATE_FINISH_MIGRATE before ram is saved. This eliminates the
>> possibility of ram and device state being out of sync, and guarantees
>> that a guest in the suspended state remains suspended, because qmp_cont
>> rejects a cont command in the RUN_STATE_FINISH_MIGRATE state.
>>
>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>> ---
>> include/migration/misc.h | 1 +
>> migration/migration.c | 32 +++++++++++++++++++++++++-------
>> 2 files changed, 26 insertions(+), 7 deletions(-)
>>
>> diff --git a/include/migration/misc.h b/include/migration/misc.h
>> index 6dc234b..54c99a3 100644
>> --- a/include/migration/misc.h
>> +++ b/include/migration/misc.h
>> @@ -60,6 +60,7 @@ void migration_object_init(void);
>> void migration_shutdown(void);
>> bool migration_is_idle(void);
>> bool migration_is_active(MigrationState *);
>> +bool migrate_mode_is_cpr(MigrationState *);
>>
>> typedef enum MigrationEventType {
>> MIG_EVENT_PRECOPY_SETUP,
>> diff --git a/migration/migration.c b/migration/migration.c
>> index d1fce9e..fc5c587 100644
>> --- a/migration/migration.c
>> +++ b/migration/migration.c
>> @@ -1603,6 +1603,11 @@ bool migration_is_active(MigrationState *s)
>> s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
>> }
>>
>> +bool migrate_mode_is_cpr(MigrationState *s)
>> +{
>> + return s->parameters.mode == MIG_MODE_CPR_REBOOT;
>> +}
>> +
>> int migrate_init(MigrationState *s, Error **errp)
>> {
>> int ret;
>> @@ -2651,13 +2656,14 @@ static int migration_completion_precopy(MigrationState *s,
>> bql_lock();
>> migration_downtime_start(s);
>>
>> - s->vm_old_state = runstate_get();
>> - global_state_store();
>> -
>> - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
>> - trace_migration_completion_vm_stop(ret);
>> - if (ret < 0) {
>> - goto out_unlock;
>> + if (!migrate_mode_is_cpr(s)) {
>> + s->vm_old_state = runstate_get();
>> + global_state_store();
>> + ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
>> + trace_migration_completion_vm_stop(ret);
>> + if (ret < 0) {
>> + goto out_unlock;
>> + }
>> }
>>
>> ret = migration_maybe_pause(s, current_active_state,
>> @@ -3576,6 +3582,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
>> Error *local_err = NULL;
>> uint64_t rate_limit;
>> bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED;
>> + int ret;
>>
>> /*
>> * If there's a previous error, free it and prepare for another one.
>> @@ -3651,6 +3658,17 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
>> goto fail;
>> }
>>
>> + if (migrate_mode_is_cpr(s)) {
>> + s->vm_old_state = runstate_get();
>> + global_state_store();
>> + ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
>> + trace_migration_completion_vm_stop(ret);
>> + if (ret < 0) {
>> + error_setg(&local_err, "migration_stop_vm failed, error %d", -ret);
>> + goto fail;
>> + }
>> + }
>
> Could we have a helper function for the shared codes?
>
> How about postcopy? I know it's nonsense to enable postcopy for cpr.. but
> iiuc we don't yet forbid an user doing so. Maybe we should?
How about this?
-------------------------------------------
@@ -3600,6 +3600,11 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
return;
}
+ if (migrate_mode_is_cpr(s) && migrate_postcopy()) {
+ error_setg(&local_err, "cannot mix postcopy and cpr");
+ goto fail;
+ }
+
if (resume) {
/* This is a resumed migration */
rate_limit = migrate_max_postcopy_bandwidth();
------------------------------------------------
- Steve
On Wed, Feb 21, 2024 at 04:23:07PM -0500, Steven Sistare wrote:
> > How about postcopy? I know it's nonsense to enable postcopy for cpr.. but
> > iiuc we don't yet forbid an user doing so. Maybe we should?
>
> How about this?
>
> -------------------------------------------
> @@ -3600,6 +3600,11 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
> return;
> }
>
> + if (migrate_mode_is_cpr(s) && migrate_postcopy()) {
> + error_setg(&local_err, "cannot mix postcopy and cpr");
> + goto fail;
> + }
> +
> if (resume) {
> /* This is a resumed migration */
> rate_limit = migrate_max_postcopy_bandwidth();
> ------------------------------------------------
migrate_fd_connect() will be a bit late, the error won't be able to be
attached in the "migrate" request. Perhaps, migrate_prepare()?
--
Peter Xu
On 2/22/2024 4:03 AM, Peter Xu wrote:
> On Wed, Feb 21, 2024 at 04:23:07PM -0500, Steven Sistare wrote:
>>> How about postcopy? I know it's nonsense to enable postcopy for cpr.. but
>>> iiuc we don't yet forbid an user doing so. Maybe we should?
>>
>> How about this?
>>
>> -------------------------------------------
>> @@ -3600,6 +3600,11 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
>> return;
>> }
>>
>> + if (migrate_mode_is_cpr(s) && migrate_postcopy()) {
>> + error_setg(&local_err, "cannot mix postcopy and cpr");
>> + goto fail;
>> + }
>> +
>> if (resume) {
>> /* This is a resumed migration */
>> rate_limit = migrate_max_postcopy_bandwidth();
>> ------------------------------------------------
>
> migrate_fd_connect() will be a bit late, the error won't be able to be
> attached in the "migrate" request. Perhaps, migrate_prepare()?
Thank you, that is better - steve
On 2/20/2024 2:33 AM, Peter Xu wrote:
> On Thu, Feb 08, 2024 at 10:54:03AM -0800, Steve Sistare wrote:
>> When migration for cpr is initiated, stop the vm and set state
>> RUN_STATE_FINISH_MIGRATE before ram is saved. This eliminates the
>> possibility of ram and device state being out of sync, and guarantees
>> that a guest in the suspended state remains suspended, because qmp_cont
>> rejects a cont command in the RUN_STATE_FINISH_MIGRATE state.
>>
>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>> ---
>> include/migration/misc.h | 1 +
>> migration/migration.c | 32 +++++++++++++++++++++++++-------
>> 2 files changed, 26 insertions(+), 7 deletions(-)
>>
>> diff --git a/include/migration/misc.h b/include/migration/misc.h
>> index 6dc234b..54c99a3 100644
>> --- a/include/migration/misc.h
>> +++ b/include/migration/misc.h
>> @@ -60,6 +60,7 @@ void migration_object_init(void);
>> void migration_shutdown(void);
>> bool migration_is_idle(void);
>> bool migration_is_active(MigrationState *);
>> +bool migrate_mode_is_cpr(MigrationState *);
>>
>> typedef enum MigrationEventType {
>> MIG_EVENT_PRECOPY_SETUP,
>> diff --git a/migration/migration.c b/migration/migration.c
>> index d1fce9e..fc5c587 100644
>> --- a/migration/migration.c
>> +++ b/migration/migration.c
>> @@ -1603,6 +1603,11 @@ bool migration_is_active(MigrationState *s)
>> s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
>> }
>>
>> +bool migrate_mode_is_cpr(MigrationState *s)
>> +{
>> + return s->parameters.mode == MIG_MODE_CPR_REBOOT;
>> +}
>> +
>> int migrate_init(MigrationState *s, Error **errp)
>> {
>> int ret;
>> @@ -2651,13 +2656,14 @@ static int migration_completion_precopy(MigrationState *s,
>> bql_lock();
>> migration_downtime_start(s);
>>
>> - s->vm_old_state = runstate_get();
>> - global_state_store();
>> -
>> - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
>> - trace_migration_completion_vm_stop(ret);
>> - if (ret < 0) {
>> - goto out_unlock;
>> + if (!migrate_mode_is_cpr(s)) {
>> + s->vm_old_state = runstate_get();
>> + global_state_store();
>> + ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
>> + trace_migration_completion_vm_stop(ret);
>> + if (ret < 0) {
>> + goto out_unlock;
>> + }
>> }
>>
>> ret = migration_maybe_pause(s, current_active_state,
>> @@ -3576,6 +3582,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
>> Error *local_err = NULL;
>> uint64_t rate_limit;
>> bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED;
>> + int ret;
>>
>> /*
>> * If there's a previous error, free it and prepare for another one.
>> @@ -3651,6 +3658,17 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
>> goto fail;
>> }
>>
>> + if (migrate_mode_is_cpr(s)) {
>> + s->vm_old_state = runstate_get();
>> + global_state_store();
>> + ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
>> + trace_migration_completion_vm_stop(ret);
>> + if (ret < 0) {
>> + error_setg(&local_err, "migration_stop_vm failed, error %d", -ret);
>> + goto fail;
>> + }
>> + }
>
> Could we have a helper function for the shared codes?
Will do.
> How about postcopy? I know it's nonsense to enable postcopy for cpr.. but
> iiuc we don't yet forbid an user doing so. Maybe we should?
I will assert that mode != cpr in the postcopy path instead, to prevent any nonsense.
- Steve
>> +
>> if (migrate_background_snapshot()) {
>> qemu_thread_create(&s->thread, "bg_snapshot",
>> bg_migration_thread, s, QEMU_THREAD_JOINABLE);
>> --
>> 1.8.3.1
>>
>
© 2016 - 2026 Red Hat, Inc.