[v1] Migration: postcopy failure recovery

[Qemu-devel] [RFC 29/29] migration: reset migrate thread vars when resumed

Posted by Peter Xu 8 years, 6 months ago

First, the MigThrError enumeration is introduced to better describe the
result of migration_detect_error(). This gives migration_thread() a
chance to know whether a recovery has happened.

Then, if a recovery is detected, migration_thread() will reset its local
variables to prepare for that.

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 migration/migration.c | 40 +++++++++++++++++++++++++++++-----------
 1 file changed, 29 insertions(+), 11 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index ecebe30..439bc22 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -2159,6 +2159,15 @@ static bool postcopy_should_start(MigrationState *s)
     return atomic_read(&s->start_postcopy) || s->start_postcopy_fast;
 }
 
+typedef enum MigThrError {
+    /* No error detected */
+    MIG_THR_ERR_NONE = 0,
+    /* Detected error, but resumed successfully */
+    MIG_THR_ERR_RECOVERED = 1,
+    /* Detected fatal error, need to exit */
+    MIG_THR_ERR_FATAL = 2,
+} MigThrError;
+
 static int postcopy_resume_handshake(MigrationState *s)
 {
     qemu_mutex_lock(&s->resume_lock);
@@ -2209,10 +2218,10 @@ static int postcopy_do_resume(MigrationState *s)
 
 /*
  * We don't return until we are in a safe state to continue current
- * postcopy migration.  Returns true to continue the migration, or
- * false to terminate current migration.
+ * postcopy migration.  Returns MIG_THR_ERR_RECOVERED if recovered, or
+ * MIG_THR_ERR_FATAL if an unrecoverable failure happened.
  */
-static bool postcopy_pause(MigrationState *s)
+static MigThrError postcopy_pause(MigrationState *s)
 {
     assert(s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
 
@@ -2247,7 +2256,7 @@ do_pause:
         if (postcopy_do_resume(s) == 0) {
             /* Let's continue! */
             trace_postcopy_pause_continued();
-            return true;
+            return MIG_THR_ERR_RECOVERED;
         } else {
             /*
              * Something wrong happened during the recovery, let's
@@ -2258,12 +2267,11 @@ do_pause:
         }
     } else {
         /* This is not right... Time to quit. */
-        return false;
+        return MIG_THR_ERR_FATAL;
     }
 }
 
-/* Return true if we want to stop the migration, otherwise false. */
-static bool migration_detect_error(MigrationState *s)
+static MigThrError migration_detect_error(MigrationState *s)
 {
     int ret;
 
@@ -2272,7 +2280,7 @@ static bool migration_detect_error(MigrationState *s)
 
     if (!ret) {
         /* Everything is fine */
-        return false;
+        return MIG_THR_ERR_NONE;
     }
 
     if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE && ret == -EIO) {
@@ -2281,7 +2289,7 @@ static bool migration_detect_error(MigrationState *s)
          * while. After that, it can be continued by a
          * recovery phase.
          */
-        return !postcopy_pause(s);
+        return postcopy_pause(s);
     } else {
         /*
          * For precopy (or postcopy with error outside IO), we fail
@@ -2291,7 +2299,7 @@ static bool migration_detect_error(MigrationState *s)
         trace_migration_thread_file_err();
 
         /* Time to stop the migration, now. */
-        return true;
+        return MIG_THR_ERR_FATAL;
     }
 }
 
@@ -2319,6 +2327,7 @@ static void *migration_thread(void *opaque)
     /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */
     enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE;
     bool enable_colo = migrate_colo_enabled();
+    MigThrError thr_error;
 
     rcu_register_thread();
 
@@ -2395,8 +2404,17 @@ static void *migration_thread(void *opaque)
          * Try to detect any kind of failures, and see whether we
          * should stop the migration now.
          */
-        if (migration_detect_error(s)) {
+        thr_error = migration_detect_error(s);
+        if (thr_error == MIG_THR_ERR_FATAL) {
+            /* Stop migration */
             break;
+        } else if (thr_error == MIG_THR_ERR_RECOVERED) {
+            /*
+             * Just recovered from a e.g. network failure, reset all
+             * the local variables.
+             */
+            initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+            initial_bytes = 0;
         }
 
         current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
-- 
2.7.4

Re: [Qemu-devel] [RFC 29/29] migration: reset migrate thread vars when resumed

Posted by Dr. David Alan Gilbert 8 years, 6 months ago

* Peter Xu (peterx@redhat.com) wrote:
> Firstly, MigThrError enumeration is introduced to describe the error in
> migration_detect_error() better. This gives the migration_thread() a
> chance to know whether a recovery has happened.
> 
> Then, if a recovery is detected, migration_thread() will reset its local
> variables to prepare for that.
> 
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  migration/migration.c | 40 +++++++++++++++++++++++++++++-----------
>  1 file changed, 29 insertions(+), 11 deletions(-)
> 
> diff --git a/migration/migration.c b/migration/migration.c
> index ecebe30..439bc22 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -2159,6 +2159,15 @@ static bool postcopy_should_start(MigrationState *s)
>      return atomic_read(&s->start_postcopy) || s->start_postcopy_fast;
>  }
>  
> +typedef enum MigThrError {
> +    /* No error detected */
> +    MIG_THR_ERR_NONE = 0,
> +    /* Detected error, but resumed successfully */
> +    MIG_THR_ERR_RECOVERED = 1,
> +    /* Detected fatal error, need to exit */
> +    MIG_THR_ERR_FATAL = 2,
> +} MigThrError;
> +

Could you move this patch earlier to when postcopy_pause is created
so it's created with this enum?

>  static int postcopy_resume_handshake(MigrationState *s)
>  {
>      qemu_mutex_lock(&s->resume_lock);
> @@ -2209,10 +2218,10 @@ static int postcopy_do_resume(MigrationState *s)
>  
>  /*
>   * We don't return until we are in a safe state to continue current
> - * postcopy migration.  Returns true to continue the migration, or
> - * false to terminate current migration.
> + * postcopy migration.  Returns MIG_THR_ERR_RECOVERED if recovered, or
> + * MIG_THR_ERR_FATAL if unrecovery failure happened.
>   */
> -static bool postcopy_pause(MigrationState *s)
> +static MigThrError postcopy_pause(MigrationState *s)
>  {
>      assert(s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
>  
> @@ -2247,7 +2256,7 @@ do_pause:
>          if (postcopy_do_resume(s) == 0) {
>              /* Let's continue! */
>              trace_postcopy_pause_continued();
> -            return true;
> +            return MIG_THR_ERR_RECOVERED;
>          } else {
>              /*
>               * Something wrong happened during the recovery, let's
> @@ -2258,12 +2267,11 @@ do_pause:
>          }
>      } else {
>          /* This is not right... Time to quit. */
> -        return false;
> +        return MIG_THR_ERR_FATAL;
>      }
>  }
>  
> -/* Return true if we want to stop the migration, otherwise false. */
> -static bool migration_detect_error(MigrationState *s)
> +static MigThrError migration_detect_error(MigrationState *s)
>  {
>      int ret;
>  
> @@ -2272,7 +2280,7 @@ static bool migration_detect_error(MigrationState *s)
>  
>      if (!ret) {
>          /* Everything is fine */
> -        return false;
> +        return MIG_THR_ERR_NONE;
>      }
>  
>      if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE && ret == -EIO) {
> @@ -2281,7 +2289,7 @@ static bool migration_detect_error(MigrationState *s)
>           * while. After that, it can be continued by a
>           * recovery phase.
>           */
> -        return !postcopy_pause(s);
> +        return postcopy_pause(s);
>      } else {
>          /*
>           * For precopy (or postcopy with error outside IO), we fail
> @@ -2291,7 +2299,7 @@ static bool migration_detect_error(MigrationState *s)
>          trace_migration_thread_file_err();
>  
>          /* Time to stop the migration, now. */
> -        return true;
> +        return MIG_THR_ERR_FATAL;
>      }
>  }
>  
> @@ -2319,6 +2327,7 @@ static void *migration_thread(void *opaque)
>      /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */
>      enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE;
>      bool enable_colo = migrate_colo_enabled();
> +    MigThrError thr_error;
>  
>      rcu_register_thread();
>  
> @@ -2395,8 +2404,17 @@ static void *migration_thread(void *opaque)
>           * Try to detect any kind of failures, and see whether we
>           * should stop the migration now.
>           */
> -        if (migration_detect_error(s)) {
> +        thr_error = migration_detect_error(s);
> +        if (thr_error == MIG_THR_ERR_FATAL) {
> +            /* Stop migration */
>              break;
> +        } else if (thr_error == MIG_THR_ERR_RECOVERED) {
> +            /*
> +             * Just recovered from a e.g. network failure, reset all
> +             * the local variables.
> +             */
> +            initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> +            initial_bytes = 0;

They don't seem that important to reset?

Dave

>          }
>  
>          current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> -- 
> 2.7.4
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK

Re: [Qemu-devel] [RFC 29/29] migration: reset migrate thread vars when resumed

Posted by Peter Xu 8 years, 6 months ago

On Thu, Aug 03, 2017 at 02:54:35PM +0100, Dr. David Alan Gilbert wrote:
> * Peter Xu (peterx@redhat.com) wrote:
> > Firstly, MigThrError enumeration is introduced to describe the error in
> > migration_detect_error() better. This gives the migration_thread() a
> > chance to know whether a recovery has happened.
> > 
> > Then, if a recovery is detected, migration_thread() will reset its local
> > variables to prepare for that.
> > 
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> > ---
> >  migration/migration.c | 40 +++++++++++++++++++++++++++++-----------
> >  1 file changed, 29 insertions(+), 11 deletions(-)
> > 
> > diff --git a/migration/migration.c b/migration/migration.c
> > index ecebe30..439bc22 100644
> > --- a/migration/migration.c
> > +++ b/migration/migration.c
> > @@ -2159,6 +2159,15 @@ static bool postcopy_should_start(MigrationState *s)
> >      return atomic_read(&s->start_postcopy) || s->start_postcopy_fast;
> >  }
> >  
> > +typedef enum MigThrError {
> > +    /* No error detected */
> > +    MIG_THR_ERR_NONE = 0,
> > +    /* Detected error, but resumed successfully */
> > +    MIG_THR_ERR_RECOVERED = 1,
> > +    /* Detected fatal error, need to exit */
> > +    MIG_THR_ERR_FATAL = 2,
> > +} MigThrError;
> > +
> 
> Could you move this patch earlier to when postcopy_pause is created
> so it's created with this enum?

Sure.

[...]

> > @@ -2319,6 +2327,7 @@ static void *migration_thread(void *opaque)
> >      /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */
> >      enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE;
> >      bool enable_colo = migrate_colo_enabled();
> > +    MigThrError thr_error;
> >  
> >      rcu_register_thread();
> >  
> > @@ -2395,8 +2404,17 @@ static void *migration_thread(void *opaque)
> >           * Try to detect any kind of failures, and see whether we
> >           * should stop the migration now.
> >           */
> > -        if (migration_detect_error(s)) {
> > +        thr_error = migration_detect_error(s);
> > +        if (thr_error == MIG_THR_ERR_FATAL) {
> > +            /* Stop migration */
> >              break;
> > +        } else if (thr_error == MIG_THR_ERR_RECOVERED) {
> > +            /*
> > +             * Just recovered from a e.g. network failure, reset all
> > +             * the local variables.
> > +             */
> > +            initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> > +            initial_bytes = 0;
> 
> They don't seem that important to reset?

The problem is that we have this in migration_thread():

        if (current_time >= initial_time + BUFFER_DELAY) {
            uint64_t transferred_bytes = qemu_ftell(s->to_dst_file) -
                                         initial_bytes;
            uint64_t time_spent = current_time - initial_time;
            double bandwidth = (double)transferred_bytes / time_spent;
            threshold_size = bandwidth * s->parameters.downtime_limit;
            ...
        }

Here qemu_ftell() would possibly be very small since we have just
resumed... and then transferred_bytes will be extremely huge since
"qemu_ftell(s->to_dst_file) - initial_bytes" is actually negative
(and transferred_bytes is a uint64_t, so the result wraps around)...
Then, with luck, we'll get an extremely huge "bandwidth" as well.

-- 
Peter Xu

Re: [Qemu-devel] [RFC 29/29] migration: reset migrate thread vars when resumed

Posted by Dr. David Alan Gilbert 8 years, 6 months ago

* Peter Xu (peterx@redhat.com) wrote:
> On Thu, Aug 03, 2017 at 02:54:35PM +0100, Dr. David Alan Gilbert wrote:
> > * Peter Xu (peterx@redhat.com) wrote:
> > > Firstly, MigThrError enumeration is introduced to describe the error in
> > > migration_detect_error() better. This gives the migration_thread() a
> > > chance to know whether a recovery has happened.
> > > 
> > > Then, if a recovery is detected, migration_thread() will reset its local
> > > variables to prepare for that.
> > > 
> > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > ---
> > >  migration/migration.c | 40 +++++++++++++++++++++++++++++-----------
> > >  1 file changed, 29 insertions(+), 11 deletions(-)
> > > 
> > > diff --git a/migration/migration.c b/migration/migration.c
> > > index ecebe30..439bc22 100644
> > > --- a/migration/migration.c
> > > +++ b/migration/migration.c
> > > @@ -2159,6 +2159,15 @@ static bool postcopy_should_start(MigrationState *s)
> > >      return atomic_read(&s->start_postcopy) || s->start_postcopy_fast;
> > >  }
> > >  
> > > +typedef enum MigThrError {
> > > +    /* No error detected */
> > > +    MIG_THR_ERR_NONE = 0,
> > > +    /* Detected error, but resumed successfully */
> > > +    MIG_THR_ERR_RECOVERED = 1,
> > > +    /* Detected fatal error, need to exit */
> > > +    MIG_THR_ERR_FATAL = 2,
> > > +} MigThrError;
> > > +
> > 
> > Could you move this patch earlier to when postcopy_pause is created
> > so it's created with this enum?
> 
> Sure.
> 
> [...]
> 
> > > @@ -2319,6 +2327,7 @@ static void *migration_thread(void *opaque)
> > >      /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */
> > >      enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE;
> > >      bool enable_colo = migrate_colo_enabled();
> > > +    MigThrError thr_error;
> > >  
> > >      rcu_register_thread();
> > >  
> > > @@ -2395,8 +2404,17 @@ static void *migration_thread(void *opaque)
> > >           * Try to detect any kind of failures, and see whether we
> > >           * should stop the migration now.
> > >           */
> > > -        if (migration_detect_error(s)) {
> > > +        thr_error = migration_detect_error(s);
> > > +        if (thr_error == MIG_THR_ERR_FATAL) {
> > > +            /* Stop migration */
> > >              break;
> > > +        } else if (thr_error == MIG_THR_ERR_RECOVERED) {
> > > +            /*
> > > +             * Just recovered from a e.g. network failure, reset all
> > > +             * the local variables.
> > > +             */
> > > +            initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> > > +            initial_bytes = 0;
> > 
> > They don't seem that important to reset?
> 
> The problem is that we have this in migration_thread():
> 
>         if (current_time >= initial_time + BUFFER_DELAY) {
>             uint64_t transferred_bytes = qemu_ftell(s->to_dst_file) -
>                                          initial_bytes;
>             uint64_t time_spent = current_time - initial_time;
>             double bandwidth = (double)transferred_bytes / time_spent;
>             threshold_size = bandwidth * s->parameters.downtime_limit;
>             ...
>         }
> 
> Here qemu_ftell() would possibly be very small since we have just
> resumed... and then transferred_bytes will be extremely huge since
> "qemu_ftell(s->to_dst_file) - initial_bytes" is actually negative...
> Then, with luck, we'll get an extremely huge "bandwidth" as well.

Ah yes that's a good reason to reset it then; add a comment like
'important to avoid breaking transferred_bytes and bandwidth
calculation'

Dave

> -- 
> Peter Xu
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK

Re: [Qemu-devel] [RFC 29/29] migration: reset migrate thread vars when resumed

Posted by Peter Xu 8 years, 6 months ago

On Fri, Aug 04, 2017 at 10:52:27AM +0100, Dr. David Alan Gilbert wrote:
> * Peter Xu (peterx@redhat.com) wrote:
> > On Thu, Aug 03, 2017 at 02:54:35PM +0100, Dr. David Alan Gilbert wrote:

[...]

> > > > @@ -2319,6 +2327,7 @@ static void *migration_thread(void *opaque)
> > > >      /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */
> > > >      enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE;
> > > >      bool enable_colo = migrate_colo_enabled();
> > > > +    MigThrError thr_error;
> > > >  
> > > >      rcu_register_thread();
> > > >  
> > > > @@ -2395,8 +2404,17 @@ static void *migration_thread(void *opaque)
> > > >           * Try to detect any kind of failures, and see whether we
> > > >           * should stop the migration now.
> > > >           */
> > > > -        if (migration_detect_error(s)) {
> > > > +        thr_error = migration_detect_error(s);
> > > > +        if (thr_error == MIG_THR_ERR_FATAL) {
> > > > +            /* Stop migration */
> > > >              break;
> > > > +        } else if (thr_error == MIG_THR_ERR_RECOVERED) {
> > > > +            /*
> > > > +             * Just recovered from a e.g. network failure, reset all
> > > > +             * the local variables.
> > > > +             */
> > > > +            initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> > > > +            initial_bytes = 0;
> > > 
> > > They don't seem that important to reset?
> > 
> > The problem is that we have this in migration_thread():
> > 
> >         if (current_time >= initial_time + BUFFER_DELAY) {
> >             uint64_t transferred_bytes = qemu_ftell(s->to_dst_file) -
> >                                          initial_bytes;
> >             uint64_t time_spent = current_time - initial_time;
> >             double bandwidth = (double)transferred_bytes / time_spent;
> >             threshold_size = bandwidth * s->parameters.downtime_limit;
> >             ...
> >         }
> > 
> > Here qemu_ftell() would possibly be very small since we have just
> > resumed... and then transferred_bytes will be extremely huge since
> > "qemu_ftell(s->to_dst_file) - initial_bytes" is actually negative...
> > Then, with luck, we'll get an extremely huge "bandwidth" as well.
> 
> Ah yes that's a good reason to reset it then; add a comment like
> 'important to avoid breaking transferred_bytes and bandwidth
> calculation'

Will do.

-- 
Peter Xu