First, introduce the MigThrError enumeration so that
migration_detect_error() can describe the error more precisely. This gives
migration_thread() a chance to know whether a recovery has happened.
Then, if a recovery is detected, migration_thread() resets its local
variables (initial_time and initial_bytes) so that it continues from a
clean state.
Signed-off-by: Peter Xu <peterx@redhat.com>
---
migration/migration.c | 40 +++++++++++++++++++++++++++++-----------
1 file changed, 29 insertions(+), 11 deletions(-)
diff --git a/migration/migration.c b/migration/migration.c
index ecebe30..439bc22 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -2159,6 +2159,15 @@ static bool postcopy_should_start(MigrationState *s)
return atomic_read(&s->start_postcopy) || s->start_postcopy_fast;
}
+typedef enum MigThrError {
+ /* No error detected */
+ MIG_THR_ERR_NONE = 0,
+ /* Detected error, but resumed successfully */
+ MIG_THR_ERR_RECOVERED = 1,
+ /* Detected fatal error, need to exit */
+ MIG_THR_ERR_FATAL = 2,
+} MigThrError;
+
static int postcopy_resume_handshake(MigrationState *s)
{
qemu_mutex_lock(&s->resume_lock);
@@ -2209,10 +2218,10 @@ static int postcopy_do_resume(MigrationState *s)
/*
* We don't return until we are in a safe state to continue current
- * postcopy migration. Returns true to continue the migration, or
- * false to terminate current migration.
+ * postcopy migration. Returns MIG_THR_ERR_RECOVERED if recovered, or
+ * MIG_THR_ERR_FATAL if an unrecoverable failure happened.
*/
-static bool postcopy_pause(MigrationState *s)
+static MigThrError postcopy_pause(MigrationState *s)
{
assert(s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
@@ -2247,7 +2256,7 @@ do_pause:
if (postcopy_do_resume(s) == 0) {
/* Let's continue! */
trace_postcopy_pause_continued();
- return true;
+ return MIG_THR_ERR_RECOVERED;
} else {
/*
* Something wrong happened during the recovery, let's
@@ -2258,12 +2267,11 @@ do_pause:
}
} else {
/* This is not right... Time to quit. */
- return false;
+ return MIG_THR_ERR_FATAL;
}
}
-/* Return true if we want to stop the migration, otherwise false. */
-static bool migration_detect_error(MigrationState *s)
+static MigThrError migration_detect_error(MigrationState *s)
{
int ret;
@@ -2272,7 +2280,7 @@ static bool migration_detect_error(MigrationState *s)
if (!ret) {
/* Everything is fine */
- return false;
+ return MIG_THR_ERR_NONE;
}
if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE && ret == -EIO) {
@@ -2281,7 +2289,7 @@ static bool migration_detect_error(MigrationState *s)
* while. After that, it can be continued by a
* recovery phase.
*/
- return !postcopy_pause(s);
+ return postcopy_pause(s);
} else {
/*
* For precopy (or postcopy with error outside IO), we fail
@@ -2291,7 +2299,7 @@ static bool migration_detect_error(MigrationState *s)
trace_migration_thread_file_err();
/* Time to stop the migration, now. */
- return true;
+ return MIG_THR_ERR_FATAL;
}
}
@@ -2319,6 +2327,7 @@ static void *migration_thread(void *opaque)
/* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */
enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE;
bool enable_colo = migrate_colo_enabled();
+ MigThrError thr_error;
rcu_register_thread();
@@ -2395,8 +2404,17 @@ static void *migration_thread(void *opaque)
* Try to detect any kind of failures, and see whether we
* should stop the migration now.
*/
- if (migration_detect_error(s)) {
+ thr_error = migration_detect_error(s);
+ if (thr_error == MIG_THR_ERR_FATAL) {
+ /* Stop migration */
break;
+ } else if (thr_error == MIG_THR_ERR_RECOVERED) {
+ /*
+ * Just recovered from e.g. a network failure, reset all
+ * the local variables.
+ */
+ initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+ initial_bytes = 0;
}
current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
--
2.7.4
* Peter Xu (peterx@redhat.com) wrote:
> Firstly, MigThrError enumeration is introduced to describe the error in
> migration_detect_error() better. This gives the migration_thread() a
> chance to know whether a recovery has happened.
>
> Then, if a recovery is detected, migration_thread() will reset its local
> variables to prepare for that.
>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
> migration/migration.c | 40 +++++++++++++++++++++++++++++-----------
> 1 file changed, 29 insertions(+), 11 deletions(-)
>
> diff --git a/migration/migration.c b/migration/migration.c
> index ecebe30..439bc22 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -2159,6 +2159,15 @@ static bool postcopy_should_start(MigrationState *s)
> return atomic_read(&s->start_postcopy) || s->start_postcopy_fast;
> }
>
> +typedef enum MigThrError {
> + /* No error detected */
> + MIG_THR_ERR_NONE = 0,
> + /* Detected error, but resumed successfully */
> + MIG_THR_ERR_RECOVERED = 1,
> + /* Detected fatal error, need to exit */
> + MIG_THR_ERR_FATAL = 2,
> +} MigThrError;
> +
Could you move this patch earlier to when postcopy_pause is created
so it's created with this enum?
> static int postcopy_resume_handshake(MigrationState *s)
> {
> qemu_mutex_lock(&s->resume_lock);
> @@ -2209,10 +2218,10 @@ static int postcopy_do_resume(MigrationState *s)
>
> /*
> * We don't return until we are in a safe state to continue current
> - * postcopy migration. Returns true to continue the migration, or
> - * false to terminate current migration.
> + * postcopy migration. Returns MIG_THR_ERR_RECOVERED if recovered, or
> + * MIG_THR_ERR_FATAL if unrecovery failure happened.
> */
> -static bool postcopy_pause(MigrationState *s)
> +static MigThrError postcopy_pause(MigrationState *s)
> {
> assert(s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
>
> @@ -2247,7 +2256,7 @@ do_pause:
> if (postcopy_do_resume(s) == 0) {
> /* Let's continue! */
> trace_postcopy_pause_continued();
> - return true;
> + return MIG_THR_ERR_RECOVERED;
> } else {
> /*
> * Something wrong happened during the recovery, let's
> @@ -2258,12 +2267,11 @@ do_pause:
> }
> } else {
> /* This is not right... Time to quit. */
> - return false;
> + return MIG_THR_ERR_FATAL;
> }
> }
>
> -/* Return true if we want to stop the migration, otherwise false. */
> -static bool migration_detect_error(MigrationState *s)
> +static MigThrError migration_detect_error(MigrationState *s)
> {
> int ret;
>
> @@ -2272,7 +2280,7 @@ static bool migration_detect_error(MigrationState *s)
>
> if (!ret) {
> /* Everything is fine */
> - return false;
> + return MIG_THR_ERR_NONE;
> }
>
> if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE && ret == -EIO) {
> @@ -2281,7 +2289,7 @@ static bool migration_detect_error(MigrationState *s)
> * while. After that, it can be continued by a
> * recovery phase.
> */
> - return !postcopy_pause(s);
> + return postcopy_pause(s);
> } else {
> /*
> * For precopy (or postcopy with error outside IO), we fail
> @@ -2291,7 +2299,7 @@ static bool migration_detect_error(MigrationState *s)
> trace_migration_thread_file_err();
>
> /* Time to stop the migration, now. */
> - return true;
> + return MIG_THR_ERR_FATAL;
> }
> }
>
> @@ -2319,6 +2327,7 @@ static void *migration_thread(void *opaque)
> /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */
> enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE;
> bool enable_colo = migrate_colo_enabled();
> + MigThrError thr_error;
>
> rcu_register_thread();
>
> @@ -2395,8 +2404,17 @@ static void *migration_thread(void *opaque)
> * Try to detect any kind of failures, and see whether we
> * should stop the migration now.
> */
> - if (migration_detect_error(s)) {
> + thr_error = migration_detect_error(s);
> + if (thr_error == MIG_THR_ERR_FATAL) {
> + /* Stop migration */
> break;
> + } else if (thr_error == MIG_THR_ERR_RECOVERED) {
> + /*
> + * Just recovered from a e.g. network failure, reset all
> + * the local variables.
> + */
> + initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> + initial_bytes = 0;
They don't seem that important to reset?
Dave
> }
>
> current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> --
> 2.7.4
>
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
On Thu, Aug 03, 2017 at 02:54:35PM +0100, Dr. David Alan Gilbert wrote:
> * Peter Xu (peterx@redhat.com) wrote:
> > Firstly, MigThrError enumeration is introduced to describe the error in
> > migration_detect_error() better. This gives the migration_thread() a
> > chance to know whether a recovery has happened.
> >
> > Then, if a recovery is detected, migration_thread() will reset its local
> > variables to prepare for that.
> >
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> > ---
> > migration/migration.c | 40 +++++++++++++++++++++++++++++-----------
> > 1 file changed, 29 insertions(+), 11 deletions(-)
> >
> > diff --git a/migration/migration.c b/migration/migration.c
> > index ecebe30..439bc22 100644
> > --- a/migration/migration.c
> > +++ b/migration/migration.c
> > @@ -2159,6 +2159,15 @@ static bool postcopy_should_start(MigrationState *s)
> > return atomic_read(&s->start_postcopy) || s->start_postcopy_fast;
> > }
> >
> > +typedef enum MigThrError {
> > + /* No error detected */
> > + MIG_THR_ERR_NONE = 0,
> > + /* Detected error, but resumed successfully */
> > + MIG_THR_ERR_RECOVERED = 1,
> > + /* Detected fatal error, need to exit */
> > + MIG_THR_ERR_FATAL = 2,
> > +} MigThrError;
> > +
>
> Could you move this patch earlier to when postcopy_pause is created
> so it's created with this enum?
Sure.
[...]
> > @@ -2319,6 +2327,7 @@ static void *migration_thread(void *opaque)
> > /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */
> > enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE;
> > bool enable_colo = migrate_colo_enabled();
> > + MigThrError thr_error;
> >
> > rcu_register_thread();
> >
> > @@ -2395,8 +2404,17 @@ static void *migration_thread(void *opaque)
> > * Try to detect any kind of failures, and see whether we
> > * should stop the migration now.
> > */
> > - if (migration_detect_error(s)) {
> > + thr_error = migration_detect_error(s);
> > + if (thr_error == MIG_THR_ERR_FATAL) {
> > + /* Stop migration */
> > break;
> > + } else if (thr_error == MIG_THR_ERR_RECOVERED) {
> > + /*
> > + * Just recovered from a e.g. network failure, reset all
> > + * the local variables.
> > + */
> > + initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> > + initial_bytes = 0;
>
> They don't seem that important to reset?
The problem is that we have this in migration_thread():
if (current_time >= initial_time + BUFFER_DELAY) {
uint64_t transferred_bytes = qemu_ftell(s->to_dst_file) -
initial_bytes;
uint64_t time_spent = current_time - initial_time;
double bandwidth = (double)transferred_bytes / time_spent;
threshold_size = bandwidth * s->parameters.downtime_limit;
...
}
Here qemu_ftell() would possibly be very small since we have just
resumed... and then transferred_bytes will be extremely huge, since
"qemu_ftell(s->to_dst_file) - initial_bytes" is actually negative and
wraps around when stored in the uint64_t...
Then, with luck, we'll get an extremely huge "bandwidth" as well.
--
Peter Xu
* Peter Xu (peterx@redhat.com) wrote:
> On Thu, Aug 03, 2017 at 02:54:35PM +0100, Dr. David Alan Gilbert wrote:
> > * Peter Xu (peterx@redhat.com) wrote:
> > > Firstly, MigThrError enumeration is introduced to describe the error in
> > > migration_detect_error() better. This gives the migration_thread() a
> > > chance to know whether a recovery has happened.
> > >
> > > Then, if a recovery is detected, migration_thread() will reset its local
> > > variables to prepare for that.
> > >
> > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > ---
> > > migration/migration.c | 40 +++++++++++++++++++++++++++++-----------
> > > 1 file changed, 29 insertions(+), 11 deletions(-)
> > >
> > > diff --git a/migration/migration.c b/migration/migration.c
> > > index ecebe30..439bc22 100644
> > > --- a/migration/migration.c
> > > +++ b/migration/migration.c
> > > @@ -2159,6 +2159,15 @@ static bool postcopy_should_start(MigrationState *s)
> > > return atomic_read(&s->start_postcopy) || s->start_postcopy_fast;
> > > }
> > >
> > > +typedef enum MigThrError {
> > > + /* No error detected */
> > > + MIG_THR_ERR_NONE = 0,
> > > + /* Detected error, but resumed successfully */
> > > + MIG_THR_ERR_RECOVERED = 1,
> > > + /* Detected fatal error, need to exit */
> > > + MIG_THR_ERR_FATAL = 2,
> > > +} MigThrError;
> > > +
> >
> > Could you move this patch earlier to when postcopy_pause is created
> > so it's created with this enum?
>
> Sure.
>
> [...]
>
> > > @@ -2319,6 +2327,7 @@ static void *migration_thread(void *opaque)
> > > /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */
> > > enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE;
> > > bool enable_colo = migrate_colo_enabled();
> > > + MigThrError thr_error;
> > >
> > > rcu_register_thread();
> > >
> > > @@ -2395,8 +2404,17 @@ static void *migration_thread(void *opaque)
> > > * Try to detect any kind of failures, and see whether we
> > > * should stop the migration now.
> > > */
> > > - if (migration_detect_error(s)) {
> > > + thr_error = migration_detect_error(s);
> > > + if (thr_error == MIG_THR_ERR_FATAL) {
> > > + /* Stop migration */
> > > break;
> > > + } else if (thr_error == MIG_THR_ERR_RECOVERED) {
> > > + /*
> > > + * Just recovered from a e.g. network failure, reset all
> > > + * the local variables.
> > > + */
> > > + initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> > > + initial_bytes = 0;
> >
> > They don't seem that important to reset?
>
> The problem is that we have this in migration_thread():
>
> if (current_time >= initial_time + BUFFER_DELAY) {
> uint64_t transferred_bytes = qemu_ftell(s->to_dst_file) -
> initial_bytes;
> uint64_t time_spent = current_time - initial_time;
> double bandwidth = (double)transferred_bytes / time_spent;
> threshold_size = bandwidth * s->parameters.downtime_limit;
> ...
> }
>
> Here qemu_ftell() would possibly be very small since we have just
> resumed... and then transferred_bytes will be extremely huge since
> "qemu_ftell(s->to_dst_file) - initial_bytes" is actually negative...
> Then, with luck, we'll got extremely huge "bandwidth" as well.
Ah yes, that's a good reason to reset it then; please add a comment like
'important to avoid breaking the transferred_bytes and bandwidth
calculation'.
Dave
> --
> Peter Xu
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
On Fri, Aug 04, 2017 at 10:52:27AM +0100, Dr. David Alan Gilbert wrote:
> * Peter Xu (peterx@redhat.com) wrote:
> > On Thu, Aug 03, 2017 at 02:54:35PM +0100, Dr. David Alan Gilbert wrote:
[...]
> > > > @@ -2319,6 +2327,7 @@ static void *migration_thread(void *opaque)
> > > > /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */
> > > > enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE;
> > > > bool enable_colo = migrate_colo_enabled();
> > > > + MigThrError thr_error;
> > > >
> > > > rcu_register_thread();
> > > >
> > > > @@ -2395,8 +2404,17 @@ static void *migration_thread(void *opaque)
> > > > * Try to detect any kind of failures, and see whether we
> > > > * should stop the migration now.
> > > > */
> > > > - if (migration_detect_error(s)) {
> > > > + thr_error = migration_detect_error(s);
> > > > + if (thr_error == MIG_THR_ERR_FATAL) {
> > > > + /* Stop migration */
> > > > break;
> > > > + } else if (thr_error == MIG_THR_ERR_RECOVERED) {
> > > > + /*
> > > > + * Just recovered from a e.g. network failure, reset all
> > > > + * the local variables.
> > > > + */
> > > > + initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> > > > + initial_bytes = 0;
> > >
> > > They don't seem that important to reset?
> >
> > The problem is that we have this in migration_thread():
> >
> > if (current_time >= initial_time + BUFFER_DELAY) {
> > uint64_t transferred_bytes = qemu_ftell(s->to_dst_file) -
> > initial_bytes;
> > uint64_t time_spent = current_time - initial_time;
> > double bandwidth = (double)transferred_bytes / time_spent;
> > threshold_size = bandwidth * s->parameters.downtime_limit;
> > ...
> > }
> >
> > Here qemu_ftell() would possibly be very small since we have just
> > resumed... and then transferred_bytes will be extremely huge since
> > "qemu_ftell(s->to_dst_file) - initial_bytes" is actually negative...
> > Then, with luck, we'll got extremely huge "bandwidth" as well.
>
> Ah yes that's a good reason to reset it then; add a comment like
> 'important to avoid breaking transferred_bytes and bandwidth
> calculation'
Will do.
--
Peter Xu
© 2016 - 2026 Red Hat, Inc.