qemu_file_get_return_path() can not fail (See also commit 3f9d6e77b)
so always open the return path socket on the source. This allows
us to reuse the return path in other parts like colo. Also take
the proper locks in colo while we're at it.
This fixes a crash due to a race between migrate_cancel() and
the colo thread shutting down.
Before, the rp socket is opened just before the rp thread is started
and closed after it terminates and postcopy fast path is closed.
Now it's the same, only the rp socket stays open until migration_cleanup().
If there is a rp thread, the rp socket is shut down at the end of migration,
but the file is still open. COLO is not compatible with postcopy, so this is
safe as no one else uses the rp socket after this point.
Signed-off-by: Lukas Straub <lukasstraub2@web.de>
---
migration/colo.c | 29 ++++++-------------
migration/migration.c | 77 ++++++++++++++++++++++++---------------------------
2 files changed, 44 insertions(+), 62 deletions(-)
diff --git a/migration/colo.c b/migration/colo.c
index ce02c71d8857d470be434bdf3a9cacad3baab0d5..0dee33f1145b81af276cf318e2984deae9ae0527 100644
--- a/migration/colo.c
+++ b/migration/colo.c
@@ -173,11 +173,13 @@ static void primary_vm_do_failover(void)
* The s->rp_state.from_dst_file and s->to_dst_file may use the
* same fd, but we still shutdown the fd for twice, it is harmless.
*/
- if (s->to_dst_file) {
- qemu_file_shutdown(s->to_dst_file);
- }
- if (s->rp_state.from_dst_file) {
- qemu_file_shutdown(s->rp_state.from_dst_file);
+ WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
+ if (s->to_dst_file) {
+ qemu_file_shutdown(s->to_dst_file);
+ }
+ if (s->rp_state.from_dst_file) {
+ qemu_file_shutdown(s->rp_state.from_dst_file);
+ }
}
old_state = failover_set_state(FAILOVER_STATUS_ACTIVE,
@@ -537,6 +539,7 @@ static void colo_process_checkpoint(MigrationState *s)
Error *local_err = NULL;
int ret;
+ assert(s->rp_state.from_dst_file);
if (get_colo_mode() != COLO_MODE_PRIMARY) {
error_report("COLO mode must be COLO_MODE_PRIMARY");
return;
@@ -544,12 +547,6 @@ static void colo_process_checkpoint(MigrationState *s)
failover_init_state();
- s->rp_state.from_dst_file = qemu_file_get_return_path(s->to_dst_file);
- if (!s->rp_state.from_dst_file) {
- error_report("Open QEMUFile from_dst_file failed");
- goto out;
- }
-
packets_compare_notifier.notify = colo_compare_notify_checkpoint;
colo_compare_register_notifier(&packets_compare_notifier);
@@ -634,16 +631,6 @@ out:
colo_compare_unregister_notifier(&packets_compare_notifier);
timer_free(s->colo_delay_timer);
qemu_event_destroy(&s->colo_checkpoint_event);
-
- /*
- * Must be called after failover BH is completed,
- * Or the failover BH may shutdown the wrong fd that
- * re-used by other threads after we release here.
- */
- if (s->rp_state.from_dst_file) {
- qemu_fclose(s->rp_state.from_dst_file);
- s->rp_state.from_dst_file = NULL;
- }
}
void migrate_start_colo_process(MigrationState *s)
diff --git a/migration/migration.c b/migration/migration.c
index f36d42ef657bdf26d78ca642d77a9b76e1c0c174..8caa56940beef12de33a799695cf486c8fbd471c 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -97,7 +97,7 @@ static GSList *migration_blockers[MIG_MODE__MAX];
static bool migration_object_check(MigrationState *ms, Error **errp);
static bool migration_switchover_start(MigrationState *s, Error **errp);
-static bool close_return_path_on_source(MigrationState *s);
+static bool stop_return_path_thread_on_source(MigrationState *s);
static void migration_completion_end(MigrationState *s);
static void migration_downtime_start(MigrationState *s)
@@ -1278,7 +1278,7 @@ static void migration_cleanup(MigrationState *s)
cpr_state_close();
cpr_transfer_source_destroy(s);
- close_return_path_on_source(s);
+ stop_return_path_thread_on_source(s);
if (s->migration_thread_running) {
bql_unlock();
@@ -1307,6 +1307,14 @@ static void migration_cleanup(MigrationState *s)
qemu_fclose(tmp);
}
+ WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
+ tmp = s->rp_state.from_dst_file;
+ s->rp_state.from_dst_file = NULL;
+ }
+ if (tmp) {
+ qemu_fclose(tmp);
+ }
+
assert(!migration_is_active());
if (s->state == MIGRATION_STATUS_CANCELLING) {
@@ -2187,38 +2195,6 @@ static bool migrate_handle_rp_resume_ack(MigrationState *s,
return true;
}
-/*
- * Release ms->rp_state.from_dst_file (and postcopy_qemufile_src if
- * existed) in a safe way.
- */
-static void migration_release_dst_files(MigrationState *ms)
-{
- QEMUFile *file = NULL;
-
- WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
- /*
- * Reset the from_dst_file pointer first before releasing it, as we
- * can't block within lock section
- */
- file = ms->rp_state.from_dst_file;
- ms->rp_state.from_dst_file = NULL;
- }
-
- /*
- * Do the same to postcopy fast path socket too if there is. No
- * locking needed because this qemufile should only be managed by
- * return path thread.
- */
- if (ms->postcopy_qemufile_src) {
- migration_ioc_unregister_yank_from_file(ms->postcopy_qemufile_src);
- qemu_file_shutdown(ms->postcopy_qemufile_src);
- qemu_fclose(ms->postcopy_qemufile_src);
- ms->postcopy_qemufile_src = NULL;
- }
-
- qemu_fclose(file);
-}
-
/*
* Handles messages sent on the return path towards the source VM
*
@@ -2388,9 +2364,9 @@ out:
return NULL;
}
-static void open_return_path_on_source(MigrationState *ms)
+static void start_return_path_thread_on_source(MigrationState *ms)
{
- ms->rp_state.from_dst_file = qemu_file_get_return_path(ms->to_dst_file);
+ assert(ms->rp_state.from_dst_file);
trace_open_return_path_on_source();
@@ -2402,7 +2378,7 @@ static void open_return_path_on_source(MigrationState *ms)
}
/* Return true if error detected, or false otherwise */
-static bool close_return_path_on_source(MigrationState *ms)
+static bool stop_return_path_thread_on_source(MigrationState *ms)
{
if (!ms->rp_state.rp_thread_created) {
return false;
@@ -2424,7 +2400,17 @@ static bool close_return_path_on_source(MigrationState *ms)
qemu_thread_join(&ms->rp_state.rp_thread);
ms->rp_state.rp_thread_created = false;
- migration_release_dst_files(ms);
+ /*
+ * Close the postcopy fast path socket if there is one.
+ * No locking needed because this qemufile should only be managed by
+ * return path thread which we just stopped.
+ */
+ if (ms->postcopy_qemufile_src) {
+ migration_ioc_unregister_yank_from_file(ms->postcopy_qemufile_src);
+ qemu_file_shutdown(ms->postcopy_qemufile_src);
+ qemu_fclose(ms->postcopy_qemufile_src);
+ ms->postcopy_qemufile_src = NULL;
+ }
trace_migration_return_path_end_after();
/* Return path will persist the error in MigrationState when quit */
@@ -2787,7 +2773,7 @@ static void migration_completion(MigrationState *s)
goto fail;
}
- if (close_return_path_on_source(s)) {
+ if (stop_return_path_thread_on_source(s)) {
goto fail;
}
@@ -2941,7 +2927,15 @@ static MigThrError postcopy_pause(MigrationState *s)
* path and just wait for the thread to finish. It will be
* re-created when we resume.
*/
- close_return_path_on_source(s);
+ stop_return_path_thread_on_source(s);
+ QEMUFile *rp_file;
+ WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
+ rp_file = s->rp_state.from_dst_file;
+ s->rp_state.from_dst_file = NULL;
+ }
+ if (rp_file) {
+ qemu_fclose(rp_file);
+ }
/*
* Current channel is possibly broken. Release it. Note that this is
@@ -3758,6 +3752,7 @@ void migration_start_outgoing(MigrationState *s)
if (!qemu_file_set_blocking(s->to_dst_file, true, &local_err)) {
goto fail;
}
+ s->rp_state.from_dst_file = qemu_file_get_return_path(s->to_dst_file);
/*
* Open the return path. For postcopy, it is used exclusively. For
@@ -3765,7 +3760,7 @@ void migration_start_outgoing(MigrationState *s)
* QEMU uses the return path.
*/
if (migrate_postcopy_ram() || migrate_return_path()) {
- open_return_path_on_source(s);
+ start_return_path_thread_on_source(s);
}
/*
--
2.39.5
On Fri, Feb 20, 2026 at 08:51:41PM +0100, Lukas Straub wrote:
> qemu_file_get_return_path() can not fail (See also commit 3f9d6e77b)
> so always open the return path socket on the source. This allows
> us to reuse the return path in other parts like colo. Also take
> the proper locks in colo while we're at it.
>
> This fixes a crash due to a race between migrate_cancel() and
> the colo thread shutting down.
>
> Before, the rp socket is opened just before the rp thread is started
> and closed after it terminates and postcopy fast path is closed.
> Now it's the same, only the rp socket stays open until migration_cleanup().
>
> If there is a rp thread, the rp socket is shut down at the end of migration,
> but the file is still open. COLO is not compatible with postcopy, so this is
> safe as no one else uses the rp socket after this point.
>
> Signed-off-by: Lukas Straub <lukasstraub2@web.de>
> ---
> migration/colo.c | 29 ++++++-------------
> migration/migration.c | 77 ++++++++++++++++++++++++---------------------------
> 2 files changed, 44 insertions(+), 62 deletions(-)
>
> diff --git a/migration/colo.c b/migration/colo.c
> index ce02c71d8857d470be434bdf3a9cacad3baab0d5..0dee33f1145b81af276cf318e2984deae9ae0527 100644
> --- a/migration/colo.c
> +++ b/migration/colo.c
> @@ -173,11 +173,13 @@ static void primary_vm_do_failover(void)
> * The s->rp_state.from_dst_file and s->to_dst_file may use the
> * same fd, but we still shutdown the fd for twice, it is harmless.
> */
> - if (s->to_dst_file) {
> - qemu_file_shutdown(s->to_dst_file);
> - }
> - if (s->rp_state.from_dst_file) {
> - qemu_file_shutdown(s->rp_state.from_dst_file);
> + WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
> + if (s->to_dst_file) {
> + qemu_file_shutdown(s->to_dst_file);
> + }
> + if (s->rp_state.from_dst_file) {
> + qemu_file_shutdown(s->rp_state.from_dst_file);
> + }
Nit: these "start to take mutex for..." changes should belong to a separate
patch.
> }
>
> old_state = failover_set_state(FAILOVER_STATUS_ACTIVE,
> @@ -537,6 +539,7 @@ static void colo_process_checkpoint(MigrationState *s)
> Error *local_err = NULL;
> int ret;
>
> + assert(s->rp_state.from_dst_file);
> if (get_colo_mode() != COLO_MODE_PRIMARY) {
> error_report("COLO mode must be COLO_MODE_PRIMARY");
> return;
> @@ -544,12 +547,6 @@ static void colo_process_checkpoint(MigrationState *s)
>
> failover_init_state();
>
> - s->rp_state.from_dst_file = qemu_file_get_return_path(s->to_dst_file);
> - if (!s->rp_state.from_dst_file) {
> - error_report("Open QEMUFile from_dst_file failed");
> - goto out;
> - }
> -
> packets_compare_notifier.notify = colo_compare_notify_checkpoint;
> colo_compare_register_notifier(&packets_compare_notifier);
>
> @@ -634,16 +631,6 @@ out:
> colo_compare_unregister_notifier(&packets_compare_notifier);
> timer_free(s->colo_delay_timer);
> qemu_event_destroy(&s->colo_checkpoint_event);
> -
> - /*
> - * Must be called after failover BH is completed,
> - * Or the failover BH may shutdown the wrong fd that
> - * re-used by other threads after we release here.
> - */
> - if (s->rp_state.from_dst_file) {
> - qemu_fclose(s->rp_state.from_dst_file);
> - s->rp_state.from_dst_file = NULL;
> - }
> }
>
> void migrate_start_colo_process(MigrationState *s)
> diff --git a/migration/migration.c b/migration/migration.c
> index f36d42ef657bdf26d78ca642d77a9b76e1c0c174..8caa56940beef12de33a799695cf486c8fbd471c 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -97,7 +97,7 @@ static GSList *migration_blockers[MIG_MODE__MAX];
>
> static bool migration_object_check(MigrationState *ms, Error **errp);
> static bool migration_switchover_start(MigrationState *s, Error **errp);
> -static bool close_return_path_on_source(MigrationState *s);
> +static bool stop_return_path_thread_on_source(MigrationState *s);
> static void migration_completion_end(MigrationState *s);
>
> static void migration_downtime_start(MigrationState *s)
> @@ -1278,7 +1278,7 @@ static void migration_cleanup(MigrationState *s)
> cpr_state_close();
> cpr_transfer_source_destroy(s);
>
> - close_return_path_on_source(s);
> + stop_return_path_thread_on_source(s);
>
> if (s->migration_thread_running) {
> bql_unlock();
> @@ -1307,6 +1307,14 @@ static void migration_cleanup(MigrationState *s)
> qemu_fclose(tmp);
> }
>
> + WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
> + tmp = s->rp_state.from_dst_file;
> + s->rp_state.from_dst_file = NULL;
> + }
> + if (tmp) {
> + qemu_fclose(tmp);
> + }
> +
> assert(!migration_is_active());
>
> if (s->state == MIGRATION_STATUS_CANCELLING) {
> @@ -2187,38 +2195,6 @@ static bool migrate_handle_rp_resume_ack(MigrationState *s,
> return true;
> }
>
> -/*
> - * Release ms->rp_state.from_dst_file (and postcopy_qemufile_src if
> - * existed) in a safe way.
> - */
> -static void migration_release_dst_files(MigrationState *ms)
> -{
> - QEMUFile *file = NULL;
> -
> - WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
> - /*
> - * Reset the from_dst_file pointer first before releasing it, as we
> - * can't block within lock section
> - */
> - file = ms->rp_state.from_dst_file;
> - ms->rp_state.from_dst_file = NULL;
> - }
> -
> - /*
> - * Do the same to postcopy fast path socket too if there is. No
> - * locking needed because this qemufile should only be managed by
> - * return path thread.
> - */
> - if (ms->postcopy_qemufile_src) {
> - migration_ioc_unregister_yank_from_file(ms->postcopy_qemufile_src);
> - qemu_file_shutdown(ms->postcopy_qemufile_src);
> - qemu_fclose(ms->postcopy_qemufile_src);
> - ms->postcopy_qemufile_src = NULL;
> - }
> -
> - qemu_fclose(file);
> -}
> -
> /*
> * Handles messages sent on the return path towards the source VM
> *
> @@ -2388,9 +2364,9 @@ out:
> return NULL;
> }
>
> -static void open_return_path_on_source(MigrationState *ms)
> +static void start_return_path_thread_on_source(MigrationState *ms)
Changing this seems OK, but I'm totally confused why you deleted
migration_release_dst_files(). That's the helper to properly close the two
possible qemufiles that rp thread uses. IIUC you can keep it then use it
in either postcopy_pause() or migration_cleanup() directly.
Then you need to remove the chunk [1] below, making the function only do
"stop" but not close.
> {
> - ms->rp_state.from_dst_file = qemu_file_get_return_path(ms->to_dst_file);
> + assert(ms->rp_state.from_dst_file);
I don't see why this change is a must, to open from_dst_file earlier. Can
we keep it as before, or would you justify it in a separate patch?
>
> trace_open_return_path_on_source();
>
> @@ -2402,7 +2378,7 @@ static void open_return_path_on_source(MigrationState *ms)
> }
>
> /* Return true if error detected, or false otherwise */
> -static bool close_return_path_on_source(MigrationState *ms)
> +static bool stop_return_path_thread_on_source(MigrationState *ms)
> {
> if (!ms->rp_state.rp_thread_created) {
> return false;
> @@ -2424,7 +2400,17 @@ static bool close_return_path_on_source(MigrationState *ms)
>
> qemu_thread_join(&ms->rp_state.rp_thread);
> ms->rp_state.rp_thread_created = false;
> - migration_release_dst_files(ms);
> + /*
> + * Close the postcopy fast path socket if there is one.
> + * No locking needed because this qemufile should only be managed by
> + * return path thread which we just stopped.
> + */
> + if (ms->postcopy_qemufile_src) {
> + migration_ioc_unregister_yank_from_file(ms->postcopy_qemufile_src);
> + qemu_file_shutdown(ms->postcopy_qemufile_src);
> + qemu_fclose(ms->postcopy_qemufile_src);
> + ms->postcopy_qemufile_src = NULL;
> + }
[1]
> trace_migration_return_path_end_after();
>
> /* Return path will persist the error in MigrationState when quit */
> @@ -2787,7 +2773,7 @@ static void migration_completion(MigrationState *s)
> goto fail;
> }
>
> - if (close_return_path_on_source(s)) {
> + if (stop_return_path_thread_on_source(s)) {
> goto fail;
> }
>
> @@ -2941,7 +2927,15 @@ static MigThrError postcopy_pause(MigrationState *s)
> * path and just wait for the thread to finish. It will be
> * re-created when we resume.
> */
> - close_return_path_on_source(s);
> + stop_return_path_thread_on_source(s);
> + QEMUFile *rp_file;
> + WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
> + rp_file = s->rp_state.from_dst_file;
> + s->rp_state.from_dst_file = NULL;
> + }
> + if (rp_file) {
> + qemu_fclose(rp_file);
> + }
Open-code this is going backwards. Please see if we can reuse
migration_release_dst_files() at least.
Thanks,
>
> /*
> * Current channel is possibly broken. Release it. Note that this is
> @@ -3758,6 +3752,7 @@ void migration_start_outgoing(MigrationState *s)
> if (!qemu_file_set_blocking(s->to_dst_file, true, &local_err)) {
> goto fail;
> }
> + s->rp_state.from_dst_file = qemu_file_get_return_path(s->to_dst_file);
>
> /*
> * Open the return path. For postcopy, it is used exclusively. For
> @@ -3765,7 +3760,7 @@ void migration_start_outgoing(MigrationState *s)
> * QEMU uses the return path.
> */
> if (migrate_postcopy_ram() || migrate_return_path()) {
> - open_return_path_on_source(s);
> + start_return_path_thread_on_source(s);
> }
>
> /*
>
> --
> 2.39.5
>
--
Peter Xu
On Thu, 26 Feb 2026 10:49:07 -0500
Peter Xu <peterx@redhat.com> wrote:
> On Fri, Feb 20, 2026 at 08:51:41PM +0100, Lukas Straub wrote:
> > qemu_file_get_return_path() can not fail (See also commit 3f9d6e77b)
> > so always open the return path socket on the source. This allows
> > us to reuse the return path in other parts like colo. Also take
> > the proper locks in colo while we're at it.
> >
> > This fixes a crash due to a race between migrate_cancel() and
> > the colo thread shutting down.
> >
> > Before, the rp socket is opened just before the rp thread is started
> > and closed after it terminates and postcopy fast path is closed.
> > Now it's the same, only the rp socket stays open until migration_cleanup().
> >
> > If there is a rp thread, the rp socket is shut down at the end of migration,
> > but the file is still open. COLO is not compatible with postcopy, so this is
> > safe as no one else uses the rp socket after this point.
> >
> > Signed-off-by: Lukas Straub <lukasstraub2@web.de>
> > ---
> > migration/colo.c | 29 ++++++-------------
> > migration/migration.c | 77 ++++++++++++++++++++++++---------------------------
> > 2 files changed, 44 insertions(+), 62 deletions(-)
> >
> > diff --git a/migration/colo.c b/migration/colo.c
> > index ce02c71d8857d470be434bdf3a9cacad3baab0d5..0dee33f1145b81af276cf318e2984deae9ae0527 100644
> > --- a/migration/colo.c
> > +++ b/migration/colo.c
> > @@ -173,11 +173,13 @@ static void primary_vm_do_failover(void)
> > * The s->rp_state.from_dst_file and s->to_dst_file may use the
> > * same fd, but we still shutdown the fd for twice, it is harmless.
> > */
> > - if (s->to_dst_file) {
> > - qemu_file_shutdown(s->to_dst_file);
> > - }
> > - if (s->rp_state.from_dst_file) {
> > - qemu_file_shutdown(s->rp_state.from_dst_file);
> > + WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
> > + if (s->to_dst_file) {
> > + qemu_file_shutdown(s->to_dst_file);
> > + }
> > + if (s->rp_state.from_dst_file) {
> > + qemu_file_shutdown(s->rp_state.from_dst_file);
> > + }
>
> Nit: these "start to take mutex for..." changes should belong to a separate
> patch.
>
Will do.
> > }
> >
> > old_state = failover_set_state(FAILOVER_STATUS_ACTIVE,
> > @@ -537,6 +539,7 @@ static void colo_process_checkpoint(MigrationState *s)
> > Error *local_err = NULL;
> > int ret;
> >
> > + assert(s->rp_state.from_dst_file);
> > if (get_colo_mode() != COLO_MODE_PRIMARY) {
> > error_report("COLO mode must be COLO_MODE_PRIMARY");
> > return;
> > @@ -544,12 +547,6 @@ static void colo_process_checkpoint(MigrationState *s)
> >
> > failover_init_state();
> >
> > - s->rp_state.from_dst_file = qemu_file_get_return_path(s->to_dst_file);
> > - if (!s->rp_state.from_dst_file) {
> > - error_report("Open QEMUFile from_dst_file failed");
> > - goto out;
> > - }
> > -
> > packets_compare_notifier.notify = colo_compare_notify_checkpoint;
> > colo_compare_register_notifier(&packets_compare_notifier);
> >
> > @@ -634,16 +631,6 @@ out:
> > colo_compare_unregister_notifier(&packets_compare_notifier);
> > timer_free(s->colo_delay_timer);
> > qemu_event_destroy(&s->colo_checkpoint_event);
> > -
> > - /*
> > - * Must be called after failover BH is completed,
> > - * Or the failover BH may shutdown the wrong fd that
> > - * re-used by other threads after we release here.
> > - */
> > - if (s->rp_state.from_dst_file) {
> > - qemu_fclose(s->rp_state.from_dst_file);
> > - s->rp_state.from_dst_file = NULL;
> > - }
> > }
> >
> > void migrate_start_colo_process(MigrationState *s)
> > diff --git a/migration/migration.c b/migration/migration.c
> > index f36d42ef657bdf26d78ca642d77a9b76e1c0c174..8caa56940beef12de33a799695cf486c8fbd471c 100644
> > --- a/migration/migration.c
> > +++ b/migration/migration.c
> > @@ -97,7 +97,7 @@ static GSList *migration_blockers[MIG_MODE__MAX];
> >
> > static bool migration_object_check(MigrationState *ms, Error **errp);
> > static bool migration_switchover_start(MigrationState *s, Error **errp);
> > -static bool close_return_path_on_source(MigrationState *s);
> > +static bool stop_return_path_thread_on_source(MigrationState *s);
> > static void migration_completion_end(MigrationState *s);
> >
> > static void migration_downtime_start(MigrationState *s)
> > @@ -1278,7 +1278,7 @@ static void migration_cleanup(MigrationState *s)
> > cpr_state_close();
> > cpr_transfer_source_destroy(s);
> >
> > - close_return_path_on_source(s);
> > + stop_return_path_thread_on_source(s);
> >
> > if (s->migration_thread_running) {
> > bql_unlock();
> > @@ -1307,6 +1307,14 @@ static void migration_cleanup(MigrationState *s)
> > qemu_fclose(tmp);
> > }
> >
> > + WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
> > + tmp = s->rp_state.from_dst_file;
> > + s->rp_state.from_dst_file = NULL;
> > + }
> > + if (tmp) {
> > + qemu_fclose(tmp);
> > + }
> > +
> > assert(!migration_is_active());
> >
> > if (s->state == MIGRATION_STATUS_CANCELLING) {
> > @@ -2187,38 +2195,6 @@ static bool migrate_handle_rp_resume_ack(MigrationState *s,
> > return true;
> > }
> >
> > -/*
> > - * Release ms->rp_state.from_dst_file (and postcopy_qemufile_src if
> > - * existed) in a safe way.
> > - */
> > -static void migration_release_dst_files(MigrationState *ms)
> > -{
> > - QEMUFile *file = NULL;
> > -
> > - WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
> > - /*
> > - * Reset the from_dst_file pointer first before releasing it, as we
> > - * can't block within lock section
> > - */
> > - file = ms->rp_state.from_dst_file;
> > - ms->rp_state.from_dst_file = NULL;
> > - }
> > -
> > - /*
> > - * Do the same to postcopy fast path socket too if there is. No
> > - * locking needed because this qemufile should only be managed by
> > - * return path thread.
> > - */
> > - if (ms->postcopy_qemufile_src) {
> > - migration_ioc_unregister_yank_from_file(ms->postcopy_qemufile_src);
> > - qemu_file_shutdown(ms->postcopy_qemufile_src);
> > - qemu_fclose(ms->postcopy_qemufile_src);
> > - ms->postcopy_qemufile_src = NULL;
> > - }
> > -
> > - qemu_fclose(file);
> > -}
> > -
> > /*
> > * Handles messages sent on the return path towards the source VM
> > *
> > @@ -2388,9 +2364,9 @@ out:
> > return NULL;
> > }
> >
> > -static void open_return_path_on_source(MigrationState *ms)
> > +static void start_return_path_thread_on_source(MigrationState *ms)
>
> Changing this seems OK, but I'm totally confused why you deleted
> migration_release_dst_files(). That's the helper to properly close the two
> possible qemufiles that rp thread uses.
Okay, so my reasoning here is:
I think that closing ms->postcopy_qemufile_src is very closely
related to cleaning up the rp thread so I moved that to
stop_return_path_thread_on_source().
Meanwhile I think s->rp_state.from_dst_file is more closely related to
s->to_dst_file and in fact I think we can remove s->rp_state.from_dst_file
entirely in the future and use s->to_dst_file for send *and* receive
just like you would with a normal socket.
So I close s->rp_state.from_dst_file right besides s->to_dst_file in
this patch. And I do the same open-coded file lock dance like
s->to_dst_file already does.
> IIUC you can keep it then use it
> in either postcopy_pause() or migration_cleanup() directly.
I can do that if you wish.
Should I keep closing ms->postcopy_qemufile_src inside
migration_release_dst_files() or stop_return_path_thread_on_source() ?
>
> Then you need to remove the chunk [1] below, making the function only do
> "stop" but not close.
>
> > {
> > - ms->rp_state.from_dst_file = qemu_file_get_return_path(ms->to_dst_file);
> > + assert(ms->rp_state.from_dst_file);
>
> I don't see why this change is a must, to open from_dst_file earlier. Can
> we keep it as before, or would you justify it in a separate patch?
Well, the issue is that opening the return path file is behind this if:
if (migrate_postcopy_ram() || migrate_return_path()) {
- open_return_path_on_source(s);
+ start_return_path_thread_on_source(s);
}
And I want to always open the return path file, without also starting
the return path thread.
>
> >
> > trace_open_return_path_on_source();
> >
> > @@ -2402,7 +2378,7 @@ static void open_return_path_on_source(MigrationState *ms)
> > }
> >
> > /* Return true if error detected, or false otherwise */
> > -static bool close_return_path_on_source(MigrationState *ms)
> > +static bool stop_return_path_thread_on_source(MigrationState *ms)
> > {
> > if (!ms->rp_state.rp_thread_created) {
> > return false;
> > @@ -2424,7 +2400,17 @@ static bool close_return_path_on_source(MigrationState *ms)
> >
> > qemu_thread_join(&ms->rp_state.rp_thread);
> > ms->rp_state.rp_thread_created = false;
> > - migration_release_dst_files(ms);
> > + /*
> > + * Close the postcopy fast path socket if there is one.
> > + * No locking needed because this qemufile should only be managed by
> > + * return path thread which we just stopped.
> > + */
> > + if (ms->postcopy_qemufile_src) {
> > + migration_ioc_unregister_yank_from_file(ms->postcopy_qemufile_src);
> > + qemu_file_shutdown(ms->postcopy_qemufile_src);
> > + qemu_fclose(ms->postcopy_qemufile_src);
> > + ms->postcopy_qemufile_src = NULL;
> > + }
>
> [1]
>
> > trace_migration_return_path_end_after();
> >
> > /* Return path will persist the error in MigrationState when quit */
> > @@ -2787,7 +2773,7 @@ static void migration_completion(MigrationState *s)
> > goto fail;
> > }
> >
> > - if (close_return_path_on_source(s)) {
> > + if (stop_return_path_thread_on_source(s)) {
> > goto fail;
> > }
> >
> > @@ -2941,7 +2927,15 @@ static MigThrError postcopy_pause(MigrationState *s)
> > * path and just wait for the thread to finish. It will be
> > * re-created when we resume.
> > */
> > - close_return_path_on_source(s);
> > + stop_return_path_thread_on_source(s);
> > + QEMUFile *rp_file;
> > + WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
> > + rp_file = s->rp_state.from_dst_file;
> > + s->rp_state.from_dst_file = NULL;
> > + }
> > + if (rp_file) {
> > + qemu_fclose(rp_file);
> > + }
>
> Open-code this is going backwards. Please see if we can reuse
> migration_release_dst_files() at least.
>
> Thanks,
>
> >
> > /*
> > * Current channel is possibly broken. Release it. Note that this is
> > @@ -3758,6 +3752,7 @@ void migration_start_outgoing(MigrationState *s)
> > if (!qemu_file_set_blocking(s->to_dst_file, true, &local_err)) {
> > goto fail;
> > }
> > + s->rp_state.from_dst_file = qemu_file_get_return_path(s->to_dst_file);
> >
> > /*
> > * Open the return path. For postcopy, it is used exclusively. For
> > @@ -3765,7 +3760,7 @@ void migration_start_outgoing(MigrationState *s)
> > * QEMU uses the return path.
> > */
> > if (migrate_postcopy_ram() || migrate_return_path()) {
> > - open_return_path_on_source(s);
> > + start_return_path_thread_on_source(s);
> > }
> >
> >
> >
> > /*
> >
> > --
> > 2.39.5
> >
>
On Fri, Feb 27, 2026 at 12:49:03PM +0100, Lukas Straub wrote:
> On Thu, 26 Feb 2026 10:49:07 -0500
> Peter Xu <peterx@redhat.com> wrote:
>
> > On Fri, Feb 20, 2026 at 08:51:41PM +0100, Lukas Straub wrote:
> > > qemu_file_get_return_path() can not fail (See also commit 3f9d6e77b)
> > > so always open the return path socket on the source. This allows
> > > us to reuse the return path in other parts like colo. Also take
> > > the proper locks in colo while we're at it.
> > >
> > > This fixes a crash due to a race between migrate_cancel() and
> > > the colo thread shutting down.
> > >
> > > Before, the rp socket is opened just before the rp thread is started
> > > and closed after it terminates and postcopy fast path is closed.
> > > Now it's the same, only the rp socket stays open until migration_cleanup().
> > >
> > > If there is a rp thread, the rp socket is shut down at the end of migration,
> > > but the file is still open. COLO is not compatible with postcopy, so this is
> > > safe as no one else uses the rp socket after this point.
> > >
> > > Signed-off-by: Lukas Straub <lukasstraub2@web.de>
> > > ---
> > > migration/colo.c | 29 ++++++-------------
> > > migration/migration.c | 77 ++++++++++++++++++++++++---------------------------
> > > 2 files changed, 44 insertions(+), 62 deletions(-)
> > >
> > > diff --git a/migration/colo.c b/migration/colo.c
> > > index ce02c71d8857d470be434bdf3a9cacad3baab0d5..0dee33f1145b81af276cf318e2984deae9ae0527 100644
> > > --- a/migration/colo.c
> > > +++ b/migration/colo.c
> > > @@ -173,11 +173,13 @@ static void primary_vm_do_failover(void)
> > > * The s->rp_state.from_dst_file and s->to_dst_file may use the
> > > * same fd, but we still shutdown the fd for twice, it is harmless.
> > > */
> > > - if (s->to_dst_file) {
> > > - qemu_file_shutdown(s->to_dst_file);
> > > - }
> > > - if (s->rp_state.from_dst_file) {
> > > - qemu_file_shutdown(s->rp_state.from_dst_file);
> > > + WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
> > > + if (s->to_dst_file) {
> > > + qemu_file_shutdown(s->to_dst_file);
> > > + }
> > > + if (s->rp_state.from_dst_file) {
> > > + qemu_file_shutdown(s->rp_state.from_dst_file);
> > > + }
> >
> > Nit: these "start to take mutex for..." changes should belong to a separate
> > patch.
> >
>
> Will do.
>
> > > }
> > >
> > > old_state = failover_set_state(FAILOVER_STATUS_ACTIVE,
> > > @@ -537,6 +539,7 @@ static void colo_process_checkpoint(MigrationState *s)
> > > Error *local_err = NULL;
> > > int ret;
> > >
> > > + assert(s->rp_state.from_dst_file);
> > > if (get_colo_mode() != COLO_MODE_PRIMARY) {
> > > error_report("COLO mode must be COLO_MODE_PRIMARY");
> > > return;
> > > @@ -544,12 +547,6 @@ static void colo_process_checkpoint(MigrationState *s)
> > >
> > > failover_init_state();
> > >
> > > - s->rp_state.from_dst_file = qemu_file_get_return_path(s->to_dst_file);
> > > - if (!s->rp_state.from_dst_file) {
> > > - error_report("Open QEMUFile from_dst_file failed");
> > > - goto out;
> > > - }
> > > -
> > > packets_compare_notifier.notify = colo_compare_notify_checkpoint;
> > > colo_compare_register_notifier(&packets_compare_notifier);
> > >
> > > @@ -634,16 +631,6 @@ out:
> > > colo_compare_unregister_notifier(&packets_compare_notifier);
> > > timer_free(s->colo_delay_timer);
> > > qemu_event_destroy(&s->colo_checkpoint_event);
> > > -
> > > - /*
> > > - * Must be called after failover BH is completed,
> > > - * Or the failover BH may shutdown the wrong fd that
> > > - * re-used by other threads after we release here.
> > > - */
> > > - if (s->rp_state.from_dst_file) {
> > > - qemu_fclose(s->rp_state.from_dst_file);
> > > - s->rp_state.from_dst_file = NULL;
> > > - }
> > > }
> > >
> > > void migrate_start_colo_process(MigrationState *s)
> > > diff --git a/migration/migration.c b/migration/migration.c
> > > index f36d42ef657bdf26d78ca642d77a9b76e1c0c174..8caa56940beef12de33a799695cf486c8fbd471c 100644
> > > --- a/migration/migration.c
> > > +++ b/migration/migration.c
> > > @@ -97,7 +97,7 @@ static GSList *migration_blockers[MIG_MODE__MAX];
> > >
> > > static bool migration_object_check(MigrationState *ms, Error **errp);
> > > static bool migration_switchover_start(MigrationState *s, Error **errp);
> > > -static bool close_return_path_on_source(MigrationState *s);
> > > +static bool stop_return_path_thread_on_source(MigrationState *s);
> > > static void migration_completion_end(MigrationState *s);
> > >
> > > static void migration_downtime_start(MigrationState *s)
> > > @@ -1278,7 +1278,7 @@ static void migration_cleanup(MigrationState *s)
> > > cpr_state_close();
> > > cpr_transfer_source_destroy(s);
> > >
> > > - close_return_path_on_source(s);
> > > + stop_return_path_thread_on_source(s);
> > >
> > > if (s->migration_thread_running) {
> > > bql_unlock();
> > > @@ -1307,6 +1307,14 @@ static void migration_cleanup(MigrationState *s)
> > > qemu_fclose(tmp);
> > > }
> > >
> > > + WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
> > > + tmp = s->rp_state.from_dst_file;
> > > + s->rp_state.from_dst_file = NULL;
> > > + }
> > > + if (tmp) {
> > > + qemu_fclose(tmp);
> > > + }
> > > +
> > > assert(!migration_is_active());
> > >
> > > if (s->state == MIGRATION_STATUS_CANCELLING) {
> > > @@ -2187,38 +2195,6 @@ static bool migrate_handle_rp_resume_ack(MigrationState *s,
> > > return true;
> > > }
> > >
> > > -/*
> > > - * Release ms->rp_state.from_dst_file (and postcopy_qemufile_src if
> > > - * existed) in a safe way.
> > > - */
> > > -static void migration_release_dst_files(MigrationState *ms)
> > > -{
> > > - QEMUFile *file = NULL;
> > > -
> > > - WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
> > > - /*
> > > - * Reset the from_dst_file pointer first before releasing it, as we
> > > - * can't block within lock section
> > > - */
> > > - file = ms->rp_state.from_dst_file;
> > > - ms->rp_state.from_dst_file = NULL;
> > > - }
> > > -
> > > - /*
> > > - * Do the same to postcopy fast path socket too if there is. No
> > > - * locking needed because this qemufile should only be managed by
> > > - * return path thread.
> > > - */
> > > - if (ms->postcopy_qemufile_src) {
> > > - migration_ioc_unregister_yank_from_file(ms->postcopy_qemufile_src);
> > > - qemu_file_shutdown(ms->postcopy_qemufile_src);
> > > - qemu_fclose(ms->postcopy_qemufile_src);
> > > - ms->postcopy_qemufile_src = NULL;
> > > - }
> > > -
> > > - qemu_fclose(file);
> > > -}
> > > -
> > > /*
> > > * Handles messages sent on the return path towards the source VM
> > > *
> > > @@ -2388,9 +2364,9 @@ out:
> > > return NULL;
> > > }
> > >
> > > -static void open_return_path_on_source(MigrationState *ms)
> > > +static void start_return_path_thread_on_source(MigrationState *ms)
> >
> > Changing this seems OK, but I'm totally confused why you deleted
> > migration_release_dst_files(). That's the helper to properly close the two
> > possible qemufiles that rp thread uses.
>
> Okay, so my reasoning here is:
> I think that closing ms->postcopy_qemufile_src is very closely
> related to cleaning up the rp thread so I moved that to
> stop_return_path_thread_on_source().
Hmm OK, I had that feeling you wanted to move all qemufile cleanups into
the cleanup function.
Actually I think that may still be easier for you, e.g. you can at least
reuse the migration_release_dst_files(). I don't see much benefit yet on
open code the preempt qemufiles..
>
> Meanwhile I think s->rp_state.from_dst_file is more closely related to
> s->to_dst_file and in fact I think we can remove s->rp_state.from_dst_file
> entirely in the future and use s->to_dst_file for send *and* receive
> just like you would with a normal socket.
>
> So I close s->rp_state.from_dst_file right besides s->to_dst_file in
> this patch. And I do the same open-coded file lock dance like
> s->to_dst_file already does.
Let's not do open coded things. That makes things worse.
If you also agree we can cleanup qemufile alawys only until migration
cleanup then we can stick with it for now.
>
> > IIUC you can keep it then use it
> > in either postcopy_pause() or migration_cleanup() directly.
>
> I can do that if you wish.
> Should I keep closing ms->postcopy_qemufile_src inside
> migration_release_dst_files() or stop_return_path_thread_on_source() ?
If you use migration_release_dst_files() in both (1) postcopy pause and (2)
migration cleanup, IIUC we should covered all cases. But please double
check.
>
> >
> > Then you need to remove the chunk [1] below, making the function only do
> > "stop" but not close.
> >
> > > {
> > > - ms->rp_state.from_dst_file = qemu_file_get_return_path(ms->to_dst_file);
> > > + assert(ms->rp_state.from_dst_file);
> >
> > I don't see why this change is a must, to open from_dst_file earlier. Can
> > we keep it as before, or would you justify it in a separate patch?
>
> Well, the issue is that opening the return path file is behind this if:
>
> if (migrate_postcopy_ram() || migrate_return_path()) {
> - open_return_path_on_source(s);
> + start_return_path_thread_on_source(s);
> }
>
> And I want to always open the return path file, without also starting
> the return path thread.
I never notice this small difference.. then logically COLO should just
depend on return-path capability.
IIUC the simplest for your series to move on is:
- If COLO always require return-path (I hope you know the best... e.g. do
you enable return-path cap for your colo deployment?), we can add that
enforcement in migrate_caps_check(). It's an ABI break only to COLO,
but if you're OK, I don't see it an issue.
- Just add one more condition above into:
if (migrate_postcopy_ram() || migrate_return_path() || migrate_colo()) {
open_return_path_on_source(s);
}
Instead of making rp complicated; after all many places assume rp thread
is there when rp qemufile is there, vice versa. Changing that needs more
monitoring of code base, IMHO. Better stick with it to be simple.
>
> >
> > >
> > > trace_open_return_path_on_source();
> > >
> > > @@ -2402,7 +2378,7 @@ static void open_return_path_on_source(MigrationState *ms)
> > > }
> > >
> > > /* Return true if error detected, or false otherwise */
> > > -static bool close_return_path_on_source(MigrationState *ms)
> > > +static bool stop_return_path_thread_on_source(MigrationState *ms)
> > > {
> > > if (!ms->rp_state.rp_thread_created) {
> > > return false;
> > > @@ -2424,7 +2400,17 @@ static bool close_return_path_on_source(MigrationState *ms)
> > >
> > > qemu_thread_join(&ms->rp_state.rp_thread);
> > > ms->rp_state.rp_thread_created = false;
> > > - migration_release_dst_files(ms);
> > > + /*
> > > + * Close the postcopy fast path socket if there is one.
> > > + * No locking needed because this qemufile should only be managed by
> > > + * return path thread which we just stopped.
> > > + */
> > > + if (ms->postcopy_qemufile_src) {
> > > + migration_ioc_unregister_yank_from_file(ms->postcopy_qemufile_src);
> > > + qemu_file_shutdown(ms->postcopy_qemufile_src);
> > > + qemu_fclose(ms->postcopy_qemufile_src);
> > > + ms->postcopy_qemufile_src = NULL;
> > > + }
> >
> > [1]
> >
> > > trace_migration_return_path_end_after();
> > >
> > > /* Return path will persist the error in MigrationState when quit */
> > > @@ -2787,7 +2773,7 @@ static void migration_completion(MigrationState *s)
> > > goto fail;
> > > }
> > >
> > > - if (close_return_path_on_source(s)) {
> > > + if (stop_return_path_thread_on_source(s)) {
> > > goto fail;
> > > }
> > >
> > > @@ -2941,7 +2927,15 @@ static MigThrError postcopy_pause(MigrationState *s)
> > > * path and just wait for the thread to finish. It will be
> > > * re-created when we resume.
> > > */
> > > - close_return_path_on_source(s);
> > > + stop_return_path_thread_on_source(s);
> > > + QEMUFile *rp_file;
> > > + WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
> > > + rp_file = s->rp_state.from_dst_file;
> > > + s->rp_state.from_dst_file = NULL;
> > > + }
> > > + if (rp_file) {
> > > + qemu_fclose(rp_file);
> > > + }
> >
> > Open-code this is going backwards. Please see if we can reuse
> > migration_release_dst_files() at least.
> >
> > Thanks,
> >
> > >
> > > /*
> > > * Current channel is possibly broken. Release it. Note that this is
> > > @@ -3758,6 +3752,7 @@ void migration_start_outgoing(MigrationState *s)
> > > if (!qemu_file_set_blocking(s->to_dst_file, true, &local_err)) {
> > > goto fail;
> > > }
> > > + s->rp_state.from_dst_file = qemu_file_get_return_path(s->to_dst_file);
> > >
> > > /*
> > > * Open the return path. For postcopy, it is used exclusively. For
> > > @@ -3765,7 +3760,7 @@ void migration_start_outgoing(MigrationState *s)
> > > * QEMU uses the return path.
> > > */
> > > if (migrate_postcopy_ram() || migrate_return_path()) {
> > > - open_return_path_on_source(s);
> > > + start_return_path_thread_on_source(s);
> > > }
> > >
> > >
> > >
> > > /*
> > >
> > > --
> > > 2.39.5
> > >
> >
>
--
Peter Xu
On Fri, 27 Feb 2026 12:22:18 -0500
Peter Xu <peterx@redhat.com> wrote:
> On Fri, Feb 27, 2026 at 12:49:03PM +0100, Lukas Straub wrote:
> > On Thu, 26 Feb 2026 10:49:07 -0500
> > Peter Xu <peterx@redhat.com> wrote:
> >
> > > On Fri, Feb 20, 2026 at 08:51:41PM +0100, Lukas Straub wrote:
> > > > qemu_file_get_return_path() can not fail (See also commit 3f9d6e77b)
> > > > so always open the return path socket on the source. This allows
> > > > us to reuse the return path in other parts like colo. Also take
> > > > the proper locks in colo while we're at it.
> > > >
> > > > This fixes a crash due to a race between migrate_cancel() and
> > > > the colo thread shutting down.
> > > >
> > > > Before, the rp socket is opened just before the rp thread is started
> > > > and closed after it terminates and postcopy fast path is closed.
> > > > Now it's the same, only the rp socket stays open until migration_cleanup().
> > > >
> > > > If there is a rp thread, the rp socket is shut down at the end of migration,
> > > > but the file is still open. COLO is not compatible with postcopy, so this is
> > > > safe as no one else uses the rp socket after this point.
> > > >
> > > > Signed-off-by: Lukas Straub <lukasstraub2@web.de>
> > > > ---
> > > > migration/colo.c | 29 ++++++-------------
> > > > migration/migration.c | 77 ++++++++++++++++++++++++---------------------------
> > > > 2 files changed, 44 insertions(+), 62 deletions(-)
> > > >
> > > > diff --git a/migration/colo.c b/migration/colo.c
> > > > index ce02c71d8857d470be434bdf3a9cacad3baab0d5..0dee33f1145b81af276cf318e2984deae9ae0527 100644
> > > > --- a/migration/colo.c
> > > > +++ b/migration/colo.c
> > > > @@ -173,11 +173,13 @@ static void primary_vm_do_failover(void)
> > > > * The s->rp_state.from_dst_file and s->to_dst_file may use the
> > > > * same fd, but we still shutdown the fd for twice, it is harmless.
> > > > */
> > > > - if (s->to_dst_file) {
> > > > - qemu_file_shutdown(s->to_dst_file);
> > > > - }
> > > > - if (s->rp_state.from_dst_file) {
> > > > - qemu_file_shutdown(s->rp_state.from_dst_file);
> > > > + WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
> > > > + if (s->to_dst_file) {
> > > > + qemu_file_shutdown(s->to_dst_file);
> > > > + }
> > > > + if (s->rp_state.from_dst_file) {
> > > > + qemu_file_shutdown(s->rp_state.from_dst_file);
> > > > + }
> > >
> > > Nit: these "start to take mutex for..." changes should belong to a separate
> > > patch.
> > >
> >
> > Will do.
> >
> > > > }
> > > >
> > > > old_state = failover_set_state(FAILOVER_STATUS_ACTIVE,
> > > > @@ -537,6 +539,7 @@ static void colo_process_checkpoint(MigrationState *s)
> > > > Error *local_err = NULL;
> > > > int ret;
> > > >
> > > > + assert(s->rp_state.from_dst_file);
> > > > if (get_colo_mode() != COLO_MODE_PRIMARY) {
> > > > error_report("COLO mode must be COLO_MODE_PRIMARY");
> > > > return;
> > > > @@ -544,12 +547,6 @@ static void colo_process_checkpoint(MigrationState *s)
> > > >
> > > > failover_init_state();
> > > >
> > > > - s->rp_state.from_dst_file = qemu_file_get_return_path(s->to_dst_file);
> > > > - if (!s->rp_state.from_dst_file) {
> > > > - error_report("Open QEMUFile from_dst_file failed");
> > > > - goto out;
> > > > - }
> > > > -
> > > > packets_compare_notifier.notify = colo_compare_notify_checkpoint;
> > > > colo_compare_register_notifier(&packets_compare_notifier);
> > > >
> > > > @@ -634,16 +631,6 @@ out:
> > > > colo_compare_unregister_notifier(&packets_compare_notifier);
> > > > timer_free(s->colo_delay_timer);
> > > > qemu_event_destroy(&s->colo_checkpoint_event);
> > > > -
> > > > - /*
> > > > - * Must be called after failover BH is completed,
> > > > - * Or the failover BH may shutdown the wrong fd that
> > > > - * re-used by other threads after we release here.
> > > > - */
> > > > - if (s->rp_state.from_dst_file) {
> > > > - qemu_fclose(s->rp_state.from_dst_file);
> > > > - s->rp_state.from_dst_file = NULL;
> > > > - }
> > > > }
> > > >
> > > > void migrate_start_colo_process(MigrationState *s)
> > > > diff --git a/migration/migration.c b/migration/migration.c
> > > > index f36d42ef657bdf26d78ca642d77a9b76e1c0c174..8caa56940beef12de33a799695cf486c8fbd471c 100644
> > > > --- a/migration/migration.c
> > > > +++ b/migration/migration.c
> > > > @@ -97,7 +97,7 @@ static GSList *migration_blockers[MIG_MODE__MAX];
> > > >
> > > > static bool migration_object_check(MigrationState *ms, Error **errp);
> > > > static bool migration_switchover_start(MigrationState *s, Error **errp);
> > > > -static bool close_return_path_on_source(MigrationState *s);
> > > > +static bool stop_return_path_thread_on_source(MigrationState *s);
> > > > static void migration_completion_end(MigrationState *s);
> > > >
> > > > static void migration_downtime_start(MigrationState *s)
> > > > @@ -1278,7 +1278,7 @@ static void migration_cleanup(MigrationState *s)
> > > > cpr_state_close();
> > > > cpr_transfer_source_destroy(s);
> > > >
> > > > - close_return_path_on_source(s);
> > > > + stop_return_path_thread_on_source(s);
> > > >
> > > > if (s->migration_thread_running) {
> > > > bql_unlock();
> > > > @@ -1307,6 +1307,14 @@ static void migration_cleanup(MigrationState *s)
> > > > qemu_fclose(tmp);
> > > > }
> > > >
> > > > + WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
> > > > + tmp = s->rp_state.from_dst_file;
> > > > + s->rp_state.from_dst_file = NULL;
> > > > + }
> > > > + if (tmp) {
> > > > + qemu_fclose(tmp);
> > > > + }
> > > > +
> > > > assert(!migration_is_active());
> > > >
> > > > if (s->state == MIGRATION_STATUS_CANCELLING) {
> > > > @@ -2187,38 +2195,6 @@ static bool migrate_handle_rp_resume_ack(MigrationState *s,
> > > > return true;
> > > > }
> > > >
> > > > -/*
> > > > - * Release ms->rp_state.from_dst_file (and postcopy_qemufile_src if
> > > > - * existed) in a safe way.
> > > > - */
> > > > -static void migration_release_dst_files(MigrationState *ms)
> > > > -{
> > > > - QEMUFile *file = NULL;
> > > > -
> > > > - WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
> > > > - /*
> > > > - * Reset the from_dst_file pointer first before releasing it, as we
> > > > - * can't block within lock section
> > > > - */
> > > > - file = ms->rp_state.from_dst_file;
> > > > - ms->rp_state.from_dst_file = NULL;
> > > > - }
> > > > -
> > > > - /*
> > > > - * Do the same to postcopy fast path socket too if there is. No
> > > > - * locking needed because this qemufile should only be managed by
> > > > - * return path thread.
> > > > - */
> > > > - if (ms->postcopy_qemufile_src) {
> > > > - migration_ioc_unregister_yank_from_file(ms->postcopy_qemufile_src);
> > > > - qemu_file_shutdown(ms->postcopy_qemufile_src);
> > > > - qemu_fclose(ms->postcopy_qemufile_src);
> > > > - ms->postcopy_qemufile_src = NULL;
> > > > - }
> > > > -
> > > > - qemu_fclose(file);
> > > > -}
> > > > -
> > > > /*
> > > > * Handles messages sent on the return path towards the source VM
> > > > *
> > > > @@ -2388,9 +2364,9 @@ out:
> > > > return NULL;
> > > > }
> > > >
> > > > -static void open_return_path_on_source(MigrationState *ms)
> > > > +static void start_return_path_thread_on_source(MigrationState *ms)
> > >
> > > Changing this seems OK, but I'm totally confused why you deleted
> > > migration_release_dst_files(). That's the helper to properly close the two
> > > possible qemufiles that rp thread uses.
> >
> > Okay, so my reasoning here is:
> > I think that closing ms->postcopy_qemufile_src is very closely
> > related to cleaning up the rp thread so I moved that to
> > stop_return_path_thread_on_source().
>
> Hmm OK, I had that feeling you wanted to move all qemufile cleanups into
> the cleanup function.
>
> Actually I think that may still be easier for you, e.g. you can at least
> reuse the migration_release_dst_files(). I don't see much benefit yet on
> open code the preempt qemufiles..
>
> >
> > Meanwhile I think s->rp_state.from_dst_file is more closely related to
> > s->to_dst_file and in fact I think we can remove s->rp_state.from_dst_file
> > entirely in the future and use s->to_dst_file for send *and* receive
> > just like you would with a normal socket.
> >
> > So I close s->rp_state.from_dst_file right besides s->to_dst_file in
> > this patch. And I do the same open-coded file lock dance like
> > s->to_dst_file already does.
>
> Let's not do open coded things. That makes things worse.
>
> If you also agree we can cleanup qemufile alawys only until migration
> cleanup then we can stick with it for now.
>
> >
> > > IIUC you can keep it then use it
> > > in either postcopy_pause() or migration_cleanup() directly.
> >
> > I can do that if you wish.
> > Should I keep closing ms->postcopy_qemufile_src inside
> > migration_release_dst_files() or stop_return_path_thread_on_source() ?
>
> If you use migration_release_dst_files() in both (1) postcopy pause and (2)
> migration cleanup, IIUC we should covered all cases. But please double
> check.
Even better idea: Add a helper that closes s->to_dst_file and s->rp_state.from_dst_file
and use that in postcopy pause and migration cleanup.
What do you think?
> >
> > >
> > > Then you need to remove the chunk [1] below, making the function only do
> > > "stop" but not close.
> > >
> > > > {
> > > > - ms->rp_state.from_dst_file = qemu_file_get_return_path(ms->to_dst_file);
> > > > + assert(ms->rp_state.from_dst_file);
> > >
> > > I don't see why this change is a must, to open from_dst_file earlier. Can
> > > we keep it as before, or would you justify it in a separate patch?
> >
> > Well, the issue is that opening the return path file is behind this if:
> >
> > if (migrate_postcopy_ram() || migrate_return_path()) {
> > - open_return_path_on_source(s);
> > + start_return_path_thread_on_source(s);
> > }
> >
> > And I want to always open the return path file, without also starting
> > the return path thread.
>
> I never notice this small difference.. then logically COLO should just
> depend on return-path capability.
>
> IIUC the simplest for your series to move on is:
>
> - If COLO always require return-path (I hope you know the best... e.g. do
> you enable return-path cap for your colo deployment?), we can add that
> enforcement in migrate_caps_check(). It's an ABI break only to COLO,
> but if you're OK, I don't see it an issue.
>
> - Just add one more condition above into:
>
> if (migrate_postcopy_ram() || migrate_return_path() || migrate_colo()) {
> open_return_path_on_source(s);
> }
>
> Instead of making rp complicated; after all many places assume rp thread
> is there when rp qemufile is there, vice versa. Changing that needs more
> monitoring of code base, IMHO. Better stick with it to be simple.
That doesn't work either, because now you have the rp thread running
and to stop the thread we
qemu_file_shutdown(ms->rp_state.from_dst_file).
And we're back to square one because we can't reuse the shut
s->rp_state.from_dst_file in colo.
In fact looking at the code, qemu_file_shutdown() shuts both
ms->rp_state.from_dst_file *and* s->to_dst_file because
it calls qio_channel_shutdown(f->ioc, QIO_CHANNEL_SHUTDOWN_BOTH) and
f->ioc is the same io channel for both files.
So after stop_return_path_thread_on_source() our connection the the
secondary is severed. That's now workable.
>
> >
> > >
> > > >
> > > > trace_open_return_path_on_source();
> > > >
> > > > @@ -2402,7 +2378,7 @@ static void open_return_path_on_source(MigrationState *ms)
> > > > }
> > > >
> > > > /* Return true if error detected, or false otherwise */
> > > > -static bool close_return_path_on_source(MigrationState *ms)
> > > > +static bool stop_return_path_thread_on_source(MigrationState *ms)
> > > > {
> > > > if (!ms->rp_state.rp_thread_created) {
> > > > return false;
> > > > @@ -2424,7 +2400,17 @@ static bool close_return_path_on_source(MigrationState *ms)
> > > >
> > > > qemu_thread_join(&ms->rp_state.rp_thread);
> > > > ms->rp_state.rp_thread_created = false;
> > > > - migration_release_dst_files(ms);
> > > > + /*
> > > > + * Close the postcopy fast path socket if there is one.
> > > > + * No locking needed because this qemufile should only be managed by
> > > > + * return path thread which we just stopped.
> > > > + */
> > > > + if (ms->postcopy_qemufile_src) {
> > > > + migration_ioc_unregister_yank_from_file(ms->postcopy_qemufile_src);
> > > > + qemu_file_shutdown(ms->postcopy_qemufile_src);
> > > > + qemu_fclose(ms->postcopy_qemufile_src);
> > > > + ms->postcopy_qemufile_src = NULL;
> > > > + }
> > >
> > > [1]
> > >
> > > > trace_migration_return_path_end_after();
> > > >
> > > > /* Return path will persist the error in MigrationState when quit */
> > > > @@ -2787,7 +2773,7 @@ static void migration_completion(MigrationState *s)
> > > > goto fail;
> > > > }
> > > >
> > > > - if (close_return_path_on_source(s)) {
> > > > + if (stop_return_path_thread_on_source(s)) {
> > > > goto fail;
> > > > }
> > > >
> > > > @@ -2941,7 +2927,15 @@ static MigThrError postcopy_pause(MigrationState *s)
> > > > * path and just wait for the thread to finish. It will be
> > > > * re-created when we resume.
> > > > */
> > > > - close_return_path_on_source(s);
> > > > + stop_return_path_thread_on_source(s);
> > > > + QEMUFile *rp_file;
> > > > + WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
> > > > + rp_file = s->rp_state.from_dst_file;
> > > > + s->rp_state.from_dst_file = NULL;
> > > > + }
> > > > + if (rp_file) {
> > > > + qemu_fclose(rp_file);
> > > > + }
> > >
> > > Open-code this is going backwards. Please see if we can reuse
> > > migration_release_dst_files() at least.
> > >
> > > Thanks,
> > >
> > > >
> > > > /*
> > > > * Current channel is possibly broken. Release it. Note that this is
> > > > @@ -3758,6 +3752,7 @@ void migration_start_outgoing(MigrationState *s)
> > > > if (!qemu_file_set_blocking(s->to_dst_file, true, &local_err)) {
> > > > goto fail;
> > > > }
> > > > + s->rp_state.from_dst_file = qemu_file_get_return_path(s->to_dst_file);
> > > >
> > > > /*
> > > > * Open the return path. For postcopy, it is used exclusively. For
> > > > @@ -3765,7 +3760,7 @@ void migration_start_outgoing(MigrationState *s)
> > > > * QEMU uses the return path.
> > > > */
> > > > if (migrate_postcopy_ram() || migrate_return_path()) {
> > > > - open_return_path_on_source(s);
> > > > + start_return_path_thread_on_source(s);
> > > > }
> > > >
> > > >
> > > >
> > > > /*
> > > >
> > > > --
> > > > 2.39.5
> > > >
> > >
> >
>
>
>
On Fri, Feb 27, 2026 at 06:59:14PM +0100, Lukas Straub wrote:
> On Fri, 27 Feb 2026 12:22:18 -0500
> Peter Xu <peterx@redhat.com> wrote:
>
> > On Fri, Feb 27, 2026 at 12:49:03PM +0100, Lukas Straub wrote:
> > > On Thu, 26 Feb 2026 10:49:07 -0500
> > > Peter Xu <peterx@redhat.com> wrote:
> > >
> > > > On Fri, Feb 20, 2026 at 08:51:41PM +0100, Lukas Straub wrote:
> > > > > qemu_file_get_return_path() can not fail (See also commit 3f9d6e77b)
> > > > > so always open the return path socket on the source. This allows
> > > > > us to reuse the return path in other parts like colo. Also take
> > > > > the proper locks in colo while we're at it.
> > > > >
> > > > > This fixes a crash due to a race between migrate_cancel() and
> > > > > the colo thread shutting down.
> > > > >
> > > > > Before, the rp socket is opened just before the rp thread is started
> > > > > and closed after it terminates and postcopy fast path is closed.
> > > > > Now it's the same, only the rp socket stays open until migration_cleanup().
> > > > >
> > > > > If there is a rp thread, the rp socket is shut down at the end of migration,
> > > > > but the file is still open. COLO is not compatible with postcopy, so this is
> > > > > safe as no one else uses the rp socket after this point.
> > > > >
> > > > > Signed-off-by: Lukas Straub <lukasstraub2@web.de>
> > > > > ---
> > > > > migration/colo.c | 29 ++++++-------------
> > > > > migration/migration.c | 77 ++++++++++++++++++++++++---------------------------
> > > > > 2 files changed, 44 insertions(+), 62 deletions(-)
> > > > >
> > > > > diff --git a/migration/colo.c b/migration/colo.c
> > > > > index ce02c71d8857d470be434bdf3a9cacad3baab0d5..0dee33f1145b81af276cf318e2984deae9ae0527 100644
> > > > > --- a/migration/colo.c
> > > > > +++ b/migration/colo.c
> > > > > @@ -173,11 +173,13 @@ static void primary_vm_do_failover(void)
> > > > > * The s->rp_state.from_dst_file and s->to_dst_file may use the
> > > > > * same fd, but we still shutdown the fd for twice, it is harmless.
> > > > > */
> > > > > - if (s->to_dst_file) {
> > > > > - qemu_file_shutdown(s->to_dst_file);
> > > > > - }
> > > > > - if (s->rp_state.from_dst_file) {
> > > > > - qemu_file_shutdown(s->rp_state.from_dst_file);
> > > > > + WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
> > > > > + if (s->to_dst_file) {
> > > > > + qemu_file_shutdown(s->to_dst_file);
> > > > > + }
> > > > > + if (s->rp_state.from_dst_file) {
> > > > > + qemu_file_shutdown(s->rp_state.from_dst_file);
> > > > > + }
> > > >
> > > > Nit: these "start to take mutex for..." changes should belong to a separate
> > > > patch.
> > > >
> > >
> > > Will do.
> > >
> > > > > }
> > > > >
> > > > > old_state = failover_set_state(FAILOVER_STATUS_ACTIVE,
> > > > > @@ -537,6 +539,7 @@ static void colo_process_checkpoint(MigrationState *s)
> > > > > Error *local_err = NULL;
> > > > > int ret;
> > > > >
> > > > > + assert(s->rp_state.from_dst_file);
> > > > > if (get_colo_mode() != COLO_MODE_PRIMARY) {
> > > > > error_report("COLO mode must be COLO_MODE_PRIMARY");
> > > > > return;
> > > > > @@ -544,12 +547,6 @@ static void colo_process_checkpoint(MigrationState *s)
> > > > >
> > > > > failover_init_state();
> > > > >
> > > > > - s->rp_state.from_dst_file = qemu_file_get_return_path(s->to_dst_file);
> > > > > - if (!s->rp_state.from_dst_file) {
> > > > > - error_report("Open QEMUFile from_dst_file failed");
> > > > > - goto out;
> > > > > - }
> > > > > -
> > > > > packets_compare_notifier.notify = colo_compare_notify_checkpoint;
> > > > > colo_compare_register_notifier(&packets_compare_notifier);
> > > > >
> > > > > @@ -634,16 +631,6 @@ out:
> > > > > colo_compare_unregister_notifier(&packets_compare_notifier);
> > > > > timer_free(s->colo_delay_timer);
> > > > > qemu_event_destroy(&s->colo_checkpoint_event);
> > > > > -
> > > > > - /*
> > > > > - * Must be called after failover BH is completed,
> > > > > - * Or the failover BH may shutdown the wrong fd that
> > > > > - * re-used by other threads after we release here.
> > > > > - */
> > > > > - if (s->rp_state.from_dst_file) {
> > > > > - qemu_fclose(s->rp_state.from_dst_file);
> > > > > - s->rp_state.from_dst_file = NULL;
> > > > > - }
> > > > > }
> > > > >
> > > > > void migrate_start_colo_process(MigrationState *s)
> > > > > diff --git a/migration/migration.c b/migration/migration.c
> > > > > index f36d42ef657bdf26d78ca642d77a9b76e1c0c174..8caa56940beef12de33a799695cf486c8fbd471c 100644
> > > > > --- a/migration/migration.c
> > > > > +++ b/migration/migration.c
> > > > > @@ -97,7 +97,7 @@ static GSList *migration_blockers[MIG_MODE__MAX];
> > > > >
> > > > > static bool migration_object_check(MigrationState *ms, Error **errp);
> > > > > static bool migration_switchover_start(MigrationState *s, Error **errp);
> > > > > -static bool close_return_path_on_source(MigrationState *s);
> > > > > +static bool stop_return_path_thread_on_source(MigrationState *s);
> > > > > static void migration_completion_end(MigrationState *s);
> > > > >
> > > > > static void migration_downtime_start(MigrationState *s)
> > > > > @@ -1278,7 +1278,7 @@ static void migration_cleanup(MigrationState *s)
> > > > > cpr_state_close();
> > > > > cpr_transfer_source_destroy(s);
> > > > >
> > > > > - close_return_path_on_source(s);
> > > > > + stop_return_path_thread_on_source(s);
> > > > >
> > > > > if (s->migration_thread_running) {
> > > > > bql_unlock();
> > > > > @@ -1307,6 +1307,14 @@ static void migration_cleanup(MigrationState *s)
> > > > > qemu_fclose(tmp);
> > > > > }
> > > > >
> > > > > + WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
> > > > > + tmp = s->rp_state.from_dst_file;
> > > > > + s->rp_state.from_dst_file = NULL;
> > > > > + }
> > > > > + if (tmp) {
> > > > > + qemu_fclose(tmp);
> > > > > + }
> > > > > +
> > > > > assert(!migration_is_active());
> > > > >
> > > > > if (s->state == MIGRATION_STATUS_CANCELLING) {
> > > > > @@ -2187,38 +2195,6 @@ static bool migrate_handle_rp_resume_ack(MigrationState *s,
> > > > > return true;
> > > > > }
> > > > >
> > > > > -/*
> > > > > - * Release ms->rp_state.from_dst_file (and postcopy_qemufile_src if
> > > > > - * existed) in a safe way.
> > > > > - */
> > > > > -static void migration_release_dst_files(MigrationState *ms)
> > > > > -{
> > > > > - QEMUFile *file = NULL;
> > > > > -
> > > > > - WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
> > > > > - /*
> > > > > - * Reset the from_dst_file pointer first before releasing it, as we
> > > > > - * can't block within lock section
> > > > > - */
> > > > > - file = ms->rp_state.from_dst_file;
> > > > > - ms->rp_state.from_dst_file = NULL;
> > > > > - }
> > > > > -
> > > > > - /*
> > > > > - * Do the same to postcopy fast path socket too if there is. No
> > > > > - * locking needed because this qemufile should only be managed by
> > > > > - * return path thread.
> > > > > - */
> > > > > - if (ms->postcopy_qemufile_src) {
> > > > > - migration_ioc_unregister_yank_from_file(ms->postcopy_qemufile_src);
> > > > > - qemu_file_shutdown(ms->postcopy_qemufile_src);
> > > > > - qemu_fclose(ms->postcopy_qemufile_src);
> > > > > - ms->postcopy_qemufile_src = NULL;
> > > > > - }
> > > > > -
> > > > > - qemu_fclose(file);
> > > > > -}
> > > > > -
> > > > > /*
> > > > > * Handles messages sent on the return path towards the source VM
> > > > > *
> > > > > @@ -2388,9 +2364,9 @@ out:
> > > > > return NULL;
> > > > > }
> > > > >
> > > > > -static void open_return_path_on_source(MigrationState *ms)
> > > > > +static void start_return_path_thread_on_source(MigrationState *ms)
> > > >
> > > > Changing this seems OK, but I'm totally confused why you deleted
> > > > migration_release_dst_files(). That's the helper to properly close the two
> > > > possible qemufiles that rp thread uses.
> > >
> > > Okay, so my reasoning here is:
> > > I think that closing ms->postcopy_qemufile_src is very closely
> > > related to cleaning up the rp thread so I moved that to
> > > stop_return_path_thread_on_source().
> >
> > Hmm OK, I had that feeling you wanted to move all qemufile cleanups into
> > the cleanup function.
> >
> > Actually I think that may still be easier for you, e.g. you can at least
> > reuse the migration_release_dst_files(). I don't see much benefit yet on
> > open code the preempt qemufiles..
> >
> > >
> > > Meanwhile I think s->rp_state.from_dst_file is more closely related to
> > > s->to_dst_file and in fact I think we can remove s->rp_state.from_dst_file
> > > entirely in the future and use s->to_dst_file for send *and* receive
> > > just like you would with a normal socket.
> > >
> > > So I close s->rp_state.from_dst_file right besides s->to_dst_file in
> > > this patch. And I do the same open-coded file lock dance like
> > > s->to_dst_file already does.
> >
> > Let's not do open coded things. That makes things worse.
> >
> > If you also agree we can cleanup qemufile alawys only until migration
> > cleanup then we can stick with it for now.
> >
> > >
> > > > IIUC you can keep it then use it
> > > > in either postcopy_pause() or migration_cleanup() directly.
> > >
> > > I can do that if you wish.
> > > Should I keep closing ms->postcopy_qemufile_src inside
> > > migration_release_dst_files() or stop_return_path_thread_on_source() ?
> >
> > If you use migration_release_dst_files() in both (1) postcopy pause and (2)
> > migration cleanup, IIUC we should covered all cases. But please double
> > check.
>
> Even better idea: Add a helper that closes s->to_dst_file and s->rp_state.from_dst_file
> and use that in postcopy pause and migration cleanup.
>
> What do you think?
If you mean having the helper close all three possible qemufiles, then it
sounds OK to me. As long as your new code will still properly manage the
preempt channel and not open-code it I'm OK.
>
> > >
> > > >
> > > > Then you need to remove the chunk [1] below, making the function only do
> > > > "stop" but not close.
> > > >
> > > > > {
> > > > > - ms->rp_state.from_dst_file = qemu_file_get_return_path(ms->to_dst_file);
> > > > > + assert(ms->rp_state.from_dst_file);
> > > >
> > > > I don't see why this change is a must, to open from_dst_file earlier. Can
> > > > we keep it as before, or would you justify it in a separate patch?
> > >
> > > Well, the issue is that opening the return path file is behind this if:
> > >
> > > if (migrate_postcopy_ram() || migrate_return_path()) {
> > > - open_return_path_on_source(s);
> > > + start_return_path_thread_on_source(s);
> > > }
> > >
> > > And I want to always open the return path file, without also starting
> > > the return path thread.
> >
> > I never notice this small difference.. then logically COLO should just
> > depend on return-path capability.
> >
> > IIUC the simplest for your series to move on is:
> >
> > - If COLO always require return-path (I hope you know the best... e.g. do
> > you enable return-path cap for your colo deployment?), we can add that
> > enforcement in migrate_caps_check(). It's an ABI break only to COLO,
> > but if you're OK, I don't see it an issue.
> >
> > - Just add one more condition above into:
> >
> > if (migrate_postcopy_ram() || migrate_return_path() || migrate_colo()) {
> > open_return_path_on_source(s);
> > }
> >
> > Instead of making rp complicated; after all many places assume rp thread
> > is there when rp qemufile is there, vice versa. Changing that needs more
> > monitoring of code base, IMHO. Better stick with it to be simple.
>
> That doesn't work either, because now you have the rp thread running
> and to stop the thread we
> qemu_file_shutdown(ms->rp_state.from_dst_file).
Do you mean this line?
WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
if (migrate_has_error(ms) && ms->rp_state.from_dst_file) {
qemu_file_shutdown(ms->rp_state.from_dst_file);
}
}
We don't shut it down if the migration succeeded, but rely on RP_SHUT
message. I think it should still be true for COLO. If something is wrong,
migration will fail and it won't switch to COLO state. Otherwise it won't
shut so IIUC COLO can reuse it.
> And we're back to square one because we can't reuse the shut
> s->rp_state.from_dst_file in colo.
>
> In fact looking at the code, qemu_file_shutdown() shuts both
> ms->rp_state.from_dst_file *and* s->to_dst_file because
> it calls qio_channel_shutdown(f->ioc, QIO_CHANNEL_SHUTDOWN_BOTH) and
> f->ioc is the same io channel for both files.
>
> So after stop_return_path_thread_on_source() our connection the the
> secondary is severed. That's now workable.
>
> >
> > >
> > > >
> > > > >
> > > > > trace_open_return_path_on_source();
> > > > >
> > > > > @@ -2402,7 +2378,7 @@ static void open_return_path_on_source(MigrationState *ms)
> > > > > }
> > > > >
> > > > > /* Return true if error detected, or false otherwise */
> > > > > -static bool close_return_path_on_source(MigrationState *ms)
> > > > > +static bool stop_return_path_thread_on_source(MigrationState *ms)
> > > > > {
> > > > > if (!ms->rp_state.rp_thread_created) {
> > > > > return false;
> > > > > @@ -2424,7 +2400,17 @@ static bool close_return_path_on_source(MigrationState *ms)
> > > > >
> > > > > qemu_thread_join(&ms->rp_state.rp_thread);
> > > > > ms->rp_state.rp_thread_created = false;
> > > > > - migration_release_dst_files(ms);
> > > > > + /*
> > > > > + * Close the postcopy fast path socket if there is one.
> > > > > + * No locking needed because this qemufile should only be managed by
> > > > > + * return path thread which we just stopped.
> > > > > + */
> > > > > + if (ms->postcopy_qemufile_src) {
> > > > > + migration_ioc_unregister_yank_from_file(ms->postcopy_qemufile_src);
> > > > > + qemu_file_shutdown(ms->postcopy_qemufile_src);
> > > > > + qemu_fclose(ms->postcopy_qemufile_src);
> > > > > + ms->postcopy_qemufile_src = NULL;
> > > > > + }
> > > >
> > > > [1]
> > > >
> > > > > trace_migration_return_path_end_after();
> > > > >
> > > > > /* Return path will persist the error in MigrationState when quit */
> > > > > @@ -2787,7 +2773,7 @@ static void migration_completion(MigrationState *s)
> > > > > goto fail;
> > > > > }
> > > > >
> > > > > - if (close_return_path_on_source(s)) {
> > > > > + if (stop_return_path_thread_on_source(s)) {
> > > > > goto fail;
> > > > > }
> > > > >
> > > > > @@ -2941,7 +2927,15 @@ static MigThrError postcopy_pause(MigrationState *s)
> > > > > * path and just wait for the thread to finish. It will be
> > > > > * re-created when we resume.
> > > > > */
> > > > > - close_return_path_on_source(s);
> > > > > + stop_return_path_thread_on_source(s);
> > > > > + QEMUFile *rp_file;
> > > > > + WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
> > > > > + rp_file = s->rp_state.from_dst_file;
> > > > > + s->rp_state.from_dst_file = NULL;
> > > > > + }
> > > > > + if (rp_file) {
> > > > > + qemu_fclose(rp_file);
> > > > > + }
> > > >
> > > > Open-code this is going backwards. Please see if we can reuse
> > > > migration_release_dst_files() at least.
> > > >
> > > > Thanks,
> > > >
> > > > >
> > > > > /*
> > > > > * Current channel is possibly broken. Release it. Note that this is
> > > > > @@ -3758,6 +3752,7 @@ void migration_start_outgoing(MigrationState *s)
> > > > > if (!qemu_file_set_blocking(s->to_dst_file, true, &local_err)) {
> > > > > goto fail;
> > > > > }
> > > > > + s->rp_state.from_dst_file = qemu_file_get_return_path(s->to_dst_file);
> > > > >
> > > > > /*
> > > > > * Open the return path. For postcopy, it is used exclusively. For
> > > > > @@ -3765,7 +3760,7 @@ void migration_start_outgoing(MigrationState *s)
> > > > > * QEMU uses the return path.
> > > > > */
> > > > > if (migrate_postcopy_ram() || migrate_return_path()) {
> > > > > - open_return_path_on_source(s);
> > > > > + start_return_path_thread_on_source(s);
> > > > > }
> > > > >
> > > > >
> > > > >
> > > > > /*
> > > > >
> > > > > --
> > > > > 2.39.5
> > > > >
> > > >
> > >
> >
> >
> >
>
--
Peter Xu
© 2016 - 2026 Red Hat, Inc.