If vmstate_loading is true, secondary_vm_do_failover will set the
failover status to FAILOVER_STATUS_RELAUNCH and return success without
actually initiating failover. However, if an error occurs during the
vmstate_loading section, failover isn't relaunched; instead, we then
wait for failover on colo_incoming_sem.
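
For context, a condensed sketch of the deferral path described above
(a sketch only, not the literal migration/colo.c source; locking and
error reporting are elided):

    /* Sketch: a failover request that arrives while the secondary VM
     * is still loading its state is only recorded; someone must
     * relaunch it once loading has finished (or failed). */
    static void secondary_vm_do_failover(void)
    {
        if (vmstate_loading) {
            /* Defer the request instead of failing over right now. */
            failover_set_state(FAILOVER_STATUS_ACTIVE,
                               FAILOVER_STATUS_RELAUNCH);
            return;
        }
        /* ... perform the actual failover ... */
    }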
Fix this by relaunching failover even if there was an error. Also,
to make this work properly, set vmstate_loading to false when
returning during the vmstate_loading section.
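
In sketch form, the relaunch check now lives in the receive loop of
colo_process_incoming_thread rather than in
colo_incoming_process_checkpoint (condensed from the hunks below):

    /* Condensed from the hunks below; this fragment sits in the
     * receive loop of colo_process_incoming_thread(). A failover
     * request that was deferred while the VM state was loading is
     * relaunched here. */
    if (failover_get_state() == FAILOVER_STATUS_RELAUNCH) {
        failover_set_state(FAILOVER_STATUS_RELAUNCH,
                           FAILOVER_STATUS_NONE);
        failover_request_active(NULL);
        break;    /* leave the loop; failover takes over */
    }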
Signed-off-by: Lukas Straub <lukasstraub2@web.de>
---
migration/colo.c | 17 ++++++++++++-----
1 file changed, 12 insertions(+), 5 deletions(-)
diff --git a/migration/colo.c b/migration/colo.c
index 2947363ae5..a69782efc5 100644
--- a/migration/colo.c
+++ b/migration/colo.c
@@ -743,6 +743,7 @@ static void colo_incoming_process_checkpoint(MigrationIncomingState *mis,
     ret = qemu_load_device_state(fb);
     if (ret < 0) {
         error_setg(errp, "COLO: load device state failed");
+        vmstate_loading = false;
         qemu_mutex_unlock_iothread();
         return;
     }
@@ -751,6 +752,7 @@ static void colo_incoming_process_checkpoint(MigrationIncomingState *mis,
     replication_get_error_all(&local_err);
     if (local_err) {
         error_propagate(errp, local_err);
+        vmstate_loading = false;
         qemu_mutex_unlock_iothread();
         return;
     }
@@ -759,6 +761,7 @@ static void colo_incoming_process_checkpoint(MigrationIncomingState *mis,
     replication_do_checkpoint_all(&local_err);
     if (local_err) {
         error_propagate(errp, local_err);
+        vmstate_loading = false;
         qemu_mutex_unlock_iothread();
         return;
     }
@@ -770,6 +773,7 @@ static void colo_incoming_process_checkpoint(MigrationIncomingState *mis,
 
     if (local_err) {
         error_propagate(errp, local_err);
+        vmstate_loading = false;
         qemu_mutex_unlock_iothread();
         return;
     }
@@ -780,9 +784,6 @@ static void colo_incoming_process_checkpoint(MigrationIncomingState *mis,
     qemu_mutex_unlock_iothread();
 
     if (failover_get_state() == FAILOVER_STATUS_RELAUNCH) {
-        failover_set_state(FAILOVER_STATUS_RELAUNCH,
-                           FAILOVER_STATUS_NONE);
-        failover_request_active(NULL);
         return;
     }
 
@@ -881,6 +882,14 @@ void *colo_process_incoming_thread(void *opaque)
             error_report_err(local_err);
             break;
         }
+
+        if (failover_get_state() == FAILOVER_STATUS_RELAUNCH) {
+            failover_set_state(FAILOVER_STATUS_RELAUNCH,
+                               FAILOVER_STATUS_NONE);
+            failover_request_active(NULL);
+            break;
+        }
+
         if (failover_get_state() != FAILOVER_STATUS_NONE) {
             error_report("failover request");
             break;
@@ -888,8 +897,6 @@ void *colo_process_incoming_thread(void *opaque)
     }
 
 out:
-    vmstate_loading = false;
-
     /*
      * There are only two reasons we can get here, some error happened
      * or the user triggered failover.
--
2.20.1
Reviewed-by: zhanghailiang <zhang.zhanghailiang@huawei.com>