[PATCH 1/2] migration: Add a file_error argument to close_return_path_on_source()

Cédric Le Goater posted 2 patches 9 months, 4 weeks ago
Maintainers: Peter Xu <peterx@redhat.com>, Fabiano Rosas <farosas@suse.de>
[PATCH 1/2] migration: Add a file_error argument to close_return_path_on_source()
Posted by Cédric Le Goater 9 months, 4 weeks ago
close_return_path_on_source() retrieves the migration error from the
QEMUFile '->to_dst_file' to know if a shutdown is required to exit
the return-path thread. However, in migrate_fd_cleanup(), '->to_dst_file'
is cleaned up before calling close_return_path_on_source() and the
shutdown is never performed, leaving the source and destination
waiting for an event to occur.

Cache the file error in a temporary variable and pass it to
close_return_path_on_source() to avoid relying on '->to_dst_file'.

Signed-off-by: Cédric Le Goater <clg@redhat.com>
---
 migration/migration.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index d5f705ceef4c925589aa49335969672c0d761fa2..2c3362235c7651c11d581f3c3639571f1f9636ef 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -100,7 +100,7 @@ static int migration_maybe_pause(MigrationState *s,
                                  int *current_active_state,
                                  int new_state);
 static void migrate_fd_cancel(MigrationState *s);
-static bool close_return_path_on_source(MigrationState *s);
+static bool close_return_path_on_source(MigrationState *s, int error);
 
 static void migration_downtime_start(MigrationState *s)
 {
@@ -1313,6 +1313,8 @@ void migrate_set_state(int *state, int old_state, int new_state)
 
 static void migrate_fd_cleanup(MigrationState *s)
 {
+    int file_error = 0;
+
     g_free(s->hostname);
     s->hostname = NULL;
     json_writer_free(s->vmdesc);
@@ -1333,6 +1335,7 @@ static void migrate_fd_cleanup(MigrationState *s)
 
         multifd_save_cleanup();
         qemu_mutex_lock(&s->qemu_file_lock);
+        file_error = qemu_file_get_error(s->to_dst_file);
         tmp = s->to_dst_file;
         s->to_dst_file = NULL;
         qemu_mutex_unlock(&s->qemu_file_lock);
@@ -1348,7 +1351,7 @@ static void migrate_fd_cleanup(MigrationState *s)
      * We already cleaned up to_dst_file, so errors from the return
      * path might be due to that, ignore them.
      */
-    close_return_path_on_source(s);
+    close_return_path_on_source(s, file_error);
 
     assert(!migration_is_active(s));
 
@@ -2357,7 +2360,7 @@ static int open_return_path_on_source(MigrationState *ms)
 }
 
 /* Return true if error detected, or false otherwise */
-static bool close_return_path_on_source(MigrationState *ms)
+static bool close_return_path_on_source(MigrationState *ms, int file_error)
 {
     if (!ms->rp_state.rp_thread_created) {
         return false;
@@ -2372,8 +2375,7 @@ static bool close_return_path_on_source(MigrationState *ms)
      * cause it to unblock if it's stuck waiting for the destination.
      */
     WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
-        if (ms->to_dst_file && ms->rp_state.from_dst_file &&
-            qemu_file_get_error(ms->to_dst_file)) {
+        if (file_error && ms->rp_state.from_dst_file) {
             qemu_file_shutdown(ms->rp_state.from_dst_file);
         }
     }
@@ -2707,6 +2709,7 @@ static void migration_completion(MigrationState *s)
 {
     int ret = 0;
     int current_active_state = s->state;
+    int file_error = qemu_file_get_error(s->to_dst_file);
 
     if (s->state == MIGRATION_STATUS_ACTIVE) {
         ret = migration_completion_precopy(s, &current_active_state);
@@ -2720,11 +2723,11 @@ static void migration_completion(MigrationState *s)
         goto fail;
     }
 
-    if (close_return_path_on_source(s)) {
+    if (close_return_path_on_source(s, file_error)) {
         goto fail;
     }
 
-    if (qemu_file_get_error(s->to_dst_file)) {
+    if (file_error) {
         trace_migration_completion_file_err();
         goto fail;
     }
@@ -2861,6 +2864,7 @@ static MigThrError postcopy_pause(MigrationState *s)
 
     while (true) {
         QEMUFile *file;
+        int file_error;
 
         /*
          * Current channel is possibly broken. Release it.  Note that this is
@@ -2874,6 +2878,7 @@ static MigThrError postcopy_pause(MigrationState *s)
         assert(s->to_dst_file);
         migration_ioc_unregister_yank_from_file(s->to_dst_file);
         qemu_mutex_lock(&s->qemu_file_lock);
+        file_error = qemu_file_get_error(s->to_dst_file);
         file = s->to_dst_file;
         s->to_dst_file = NULL;
         qemu_mutex_unlock(&s->qemu_file_lock);
@@ -2886,7 +2891,7 @@ static MigThrError postcopy_pause(MigrationState *s)
          * path and just wait for the thread to finish. It will be
          * re-created when we resume.
          */
-        close_return_path_on_source(s);
+        close_return_path_on_source(s, file_error);
 
         migrate_set_state(&s->state, s->state,
                           MIGRATION_STATUS_POSTCOPY_PAUSED);
-- 
2.43.0


Re: [PATCH 1/2] migration: Add a file_error argument to close_return_path_on_source()
Posted by Fabiano Rosas 9 months, 3 weeks ago
Cédric Le Goater <clg@redhat.com> writes:

> close_return_path_on_source() retrieves the migration error from the
> QEMUFile '->to_dst_file' to know if a shutdown is required to exit
> the return-path thread. However, in migrate_fd_cleanup(), '->to_dst_file'
> is cleaned up before calling close_return_path_on_source() and the
> shutdown is never performed, leaving the source and destination
> waiting for an event to occur.

Isn't this just missing qemu_file_shutdown() at migrate_fd_cleanup?

    if (s->to_dst_file) {
        ...
        migration_ioc_unregister_yank_from_file(tmp);
+       qemu_file_shutdown(tmp);        
        qemu_fclose(tmp);
    }
Re: [PATCH 1/2] migration: Add a file_error argument to close_return_path_on_source()
Posted by Cédric Le Goater 9 months, 3 weeks ago
On 2/2/24 15:30, Fabiano Rosas wrote:
> Cédric Le Goater <clg@redhat.com> writes:
> 
>> close_return_path_on_source() retrieves the migration error from the
>> QEMUFile '->to_dst_file' to know if a shutdown is required to exit
>> the return-path thread. However, in migrate_fd_cleanup(), '->to_dst_file'
>> is cleaned up before calling close_return_path_on_source() and the
>> shutdown is never performed, leaving the source and destination
>> waiting for an event to occur.
> 
> Isn't this just missing qemu_file_shutdown() at migrate_fd_cleanup?
> 
>      if (s->to_dst_file) {
>          ...
>          migration_ioc_unregister_yank_from_file(tmp);
> +       qemu_file_shutdown(tmp);
>          qemu_fclose(tmp);
>      }
> 

That would make the return-path thread exit indeed. It should not
be necessary when there are no errors though and this is done
outside of the close_return_path_on_source() helper. There could
be side effects.


I took into account Peter's comment and replaced the changes of
PATCH 1 with :

@@ -2372,8 +2372,7 @@ static bool close_return_path_on_source(
       * cause it to unblock if it's stuck waiting for the destination.
       */
      WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
-        if (ms->to_dst_file && ms->rp_state.from_dst_file &&
-            qemu_file_get_error(ms->to_dst_file)) {
+        if (migrate_has_error(ms) && ms->rp_state.from_dst_file) {
              qemu_file_shutdown(ms->rp_state.from_dst_file);
          }
      }

Nevertheless, we need to qemu_file_shutdown() the socket correctly
for this to work, and the problem seems more complex than just moving
code as I did in PATCH 2.

Thanks,

C.