[PATCH 2/5] migration/multifd: Remove p->running

Fabiano Rosas posted 5 patches 9 months, 3 weeks ago
Maintainers: Peter Xu <peterx@redhat.com>, Fabiano Rosas <farosas@suse.de>
There is a newer version of this series
[PATCH 2/5] migration/multifd: Remove p->running
Posted by Fabiano Rosas 9 months, 3 weeks ago
We currently only need p->running to avoid calling qemu_thread_join()
on a non existent thread if the thread has never been created.

However, there are at least two bugs in this logic:

1) On the sending side, p->running is set too early and
qemu_thread_create() can be skipped due to an error during TLS
handshake, leaving the flag set and leading to a crash when
multifd_save_cleanup() calls qemu_thread_join().

2) During exit, the multifd thread clears the flag while holding the
channel lock. The counterpart at multifd_save_cleanup() reads the flag
outside of the lock and might free the mutex while the multifd thread
still has it locked.

Fix the first issue by setting the flag right before creating the
thread. Rename it from p->running to p->thread_created to clarify its
usage.

Fix the second issue by not clearing the flag at the multifd thread
exit. We don't have any use for that.

Note that these bugs are straight-forward logic issues and not race
conditions. There is still a gap for races to affect this code due to
multifd_save_cleanup() being allowed to run concurrently with the
thread creation loop. This issue is solved in the next patch.

Fixes: 29647140157a ("migration/tls: add support for multifd tls-handshake")
Reported-by: Avihai Horon <avihaih@nvidia.com>
Reported-by: <chenyuhui5@huawei.com>
Signed-off-by: Fabiano Rosas <farosas@suse.de>
---
 migration/multifd.c | 30 ++++++++++++------------------
 migration/multifd.h |  7 ++-----
 2 files changed, 14 insertions(+), 23 deletions(-)

diff --git a/migration/multifd.c b/migration/multifd.c
index b557d046a9..402b7fd776 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -528,7 +528,7 @@ void multifd_save_cleanup(void)
             qemu_thread_join(&p->tls_thread);
         }
 
-        if (p->running) {
+        if (p->thread_created) {
             qemu_thread_join(&p->thread);
         }
     }
@@ -759,10 +759,6 @@ out:
         error_free(local_err);
     }
 
-    qemu_mutex_lock(&p->mutex);
-    p->running = false;
-    qemu_mutex_unlock(&p->mutex);
-
     rcu_unregister_thread();
     migration_threads_remove(thread);
     trace_multifd_send_thread_end(p->id, p->num_packets, p->total_normal_pages);
@@ -859,6 +855,8 @@ static bool multifd_channel_connect(MultiFDSendParams *p,
     migration_ioc_register_yank(ioc);
     p->registered_yank = true;
     p->c = ioc;
+
+    p->thread_created = true;
     qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
                        QEMU_THREAD_JOINABLE);
     return true;
@@ -890,7 +888,6 @@ static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
     trace_multifd_new_send_channel_async(p->id);
     if (!qio_task_propagate_error(task, &local_err)) {
         qio_channel_set_delay(ioc, false);
-        p->running = true;
         if (multifd_channel_connect(p, ioc, &local_err)) {
             return;
         }
@@ -1030,15 +1027,15 @@ void multifd_load_cleanup(void)
     for (i = 0; i < migrate_multifd_channels(); i++) {
         MultiFDRecvParams *p = &multifd_recv_state->params[i];
 
-        if (p->running) {
-            /*
-             * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code,
-             * however try to wakeup it without harm in cleanup phase.
-             */
-            qemu_sem_post(&p->sem_sync);
-        }
+        /*
+         * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code,
+         * however try to wakeup it without harm in cleanup phase.
+         */
+        qemu_sem_post(&p->sem_sync);
 
-        qemu_thread_join(&p->thread);
+        if (p->thread_created) {
+            qemu_thread_join(&p->thread);
+        }
     }
     for (i = 0; i < migrate_multifd_channels(); i++) {
         MultiFDRecvParams *p = &multifd_recv_state->params[i];
@@ -1148,9 +1145,6 @@ static void *multifd_recv_thread(void *opaque)
         multifd_recv_terminate_threads(local_err);
         error_free(local_err);
     }
-    qemu_mutex_lock(&p->mutex);
-    p->running = false;
-    qemu_mutex_unlock(&p->mutex);
 
     rcu_unregister_thread();
     trace_multifd_recv_thread_end(p->id, p->num_packets, p->total_normal_pages);
@@ -1258,7 +1252,7 @@ void multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
     /* initial packet */
     p->num_packets = 1;
 
-    p->running = true;
+    p->thread_created = true;
     qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
                        QEMU_THREAD_JOINABLE);
     qatomic_inc(&multifd_recv_state->count);
diff --git a/migration/multifd.h b/migration/multifd.h
index 5a69ef40e2..917833c309 100644
--- a/migration/multifd.h
+++ b/migration/multifd.h
@@ -73,6 +73,7 @@ typedef struct {
     char *name;
     /* channel thread id */
     QemuThread thread;
+    bool thread_created;
     QemuThread tls_thread;
     bool tls_thread_created;
     /* communication channel */
@@ -95,8 +96,6 @@ typedef struct {
 
     /* this mutex protects the following parameters */
     QemuMutex mutex;
-    /* is this channel thread running */
-    bool running;
     /* should this thread finish */
     bool quit;
     /* multifd flags for each packet */
@@ -144,6 +143,7 @@ typedef struct {
     char *name;
     /* channel thread id */
     QemuThread thread;
+    bool thread_created;
     /* communication channel */
     QIOChannel *c;
     /* packet allocated len */
@@ -158,8 +158,6 @@ typedef struct {
 
     /* this mutex protects the following parameters */
     QemuMutex mutex;
-    /* is this channel thread running */
-    bool running;
     /* should this thread finish */
     bool quit;
     /* multifd flags for each packet */
@@ -209,4 +207,3 @@ typedef struct {
 void multifd_register_ops(int method, MultiFDMethods *ops);
 
 #endif
-
-- 
2.35.3
Re: [PATCH 2/5] migration/multifd: Remove p->running
Posted by Peter Xu 9 months, 3 weeks ago
On Fri, Feb 02, 2024 at 04:11:25PM -0300, Fabiano Rosas wrote:
> We currently only need p->running to avoid calling qemu_thread_join()
> on a non existent thread if the thread has never been created.
> 
> However, there are at least two bugs in this logic:
> 
> 1) On the sending side, p->running is set too early and
> qemu_thread_create() can be skipped due to an error during TLS
> handshake, leaving the flag set and leading to a crash when
> multifd_save_cleanup() calls qemu_thread_join().
> 
> 2) During exit, the multifd thread clears the flag while holding the
> channel lock. The counterpart at multifd_save_cleanup() reads the flag
> outside of the lock and might free the mutex while the multifd thread
> still has it locked.
> 
> Fix the first issue by setting the flag right before creating the
> thread. Rename it from p->running to p->thread_created to clarify its
> usage.
> 
> Fix the second issue by not clearing the flag at the multifd thread
> exit. We don't have any use for that.
> 
> Note that these bugs are straight-forward logic issues and not race
> conditions. There is still a gap for races to affect this code due to
> multifd_save_cleanup() being allowed to run concurrently with the
> thread creation loop. This issue is solved in the next patch.
> 

Cc: qemu-stable <qemu-stable@nongnu.org>

> Fixes: 29647140157a ("migration/tls: add support for multifd tls-handshake")
> Reported-by: Avihai Horon <avihaih@nvidia.com>
> Reported-by: <chenyuhui5@huawei.com>
> Signed-off-by: Fabiano Rosas <farosas@suse.de>

Reviewed-by: Peter Xu <peterx@redhat.com>

-- 
Peter Xu