[v9] migration: Add COLO multifd support and COLO migration unit test

[PATCH v9 19/19] colo: Fix a rare crash during shutdown

Posted by Lukas Straub 1 month, 3 weeks ago

In the colo migration unit test, we shutdown all sockets with yank and
then stop qemu with SIGTERM. During shutdown migration_shutdown() calls
migration_cancel(). Now the colo thread frees s->rp_state.from_dst_file
which races with migration_cancel() checking for NULL and potentially calling
qemu_file_shutdown() on it.

Fix this by taking the s->qemu_file_lock.

Signed-off-by: Lukas Straub <lukasstraub2@web.de>
---
 migration/colo.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/migration/colo.c b/migration/colo.c
index ce02c71d8857d470be434bdf3a9cacad3baab0d5..180793fe3f25140fa10887acc3d87515ebf43ac9 100644
--- a/migration/colo.c
+++ b/migration/colo.c
@@ -173,11 +173,13 @@ static void primary_vm_do_failover(void)
      * The s->rp_state.from_dst_file and s->to_dst_file may use the
      * same fd, but we still shutdown the fd for twice, it is harmless.
      */
-    if (s->to_dst_file) {
-        qemu_file_shutdown(s->to_dst_file);
-    }
-    if (s->rp_state.from_dst_file) {
-        qemu_file_shutdown(s->rp_state.from_dst_file);
+    WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
+        if (s->to_dst_file) {
+            qemu_file_shutdown(s->to_dst_file);
+        }
+        if (s->rp_state.from_dst_file) {
+            qemu_file_shutdown(s->rp_state.from_dst_file);
+        }
     }
 
     old_state = failover_set_state(FAILOVER_STATUS_ACTIVE,
@@ -544,11 +546,14 @@ static void colo_process_checkpoint(MigrationState *s)
 
     failover_init_state();
 
+    qemu_mutex_lock(&s->qemu_file_lock);
     s->rp_state.from_dst_file = qemu_file_get_return_path(s->to_dst_file);
     if (!s->rp_state.from_dst_file) {
+        qemu_mutex_unlock(&s->qemu_file_lock);
         error_report("Open QEMUFile from_dst_file failed");
         goto out;
     }
+    qemu_mutex_unlock(&s->qemu_file_lock);
 
     packets_compare_notifier.notify = colo_compare_notify_checkpoint;
     colo_compare_register_notifier(&packets_compare_notifier);
@@ -640,9 +645,11 @@ out:
      * Or the failover BH may shutdown the wrong fd that
      * re-used by other threads after we release here.
      */
-    if (s->rp_state.from_dst_file) {
-        qemu_fclose(s->rp_state.from_dst_file);
-        s->rp_state.from_dst_file = NULL;
+    WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
+        if (s->rp_state.from_dst_file) {
+            qemu_fclose(s->rp_state.from_dst_file);
+            s->rp_state.from_dst_file = NULL;
+        }
     }
 }
 

-- 
2.39.5

Re: [PATCH v9 19/19] colo: Fix a rare crash during shutdown

Posted by Peter Xu 1 month, 3 weeks ago

On Wed, Feb 18, 2026 at 10:29:39PM +0100, Lukas Straub wrote:
> In the colo migration unit test, we shutdown all sockets with yank and
> then stop qemu with SIGTERM. During shutdown migration_shutdown() calls
> migration_cancel(). Now the colo thread frees s->rp_state.from_dst_file
> which races with migration_cancel() checking for NULL and potentially calling
> qemu_file_shutdown() on it.
> 
> Fix this by taking the s->qemu_file_lock.
> 
> Signed-off-by: Lukas Straub <lukasstraub2@web.de>

The whole patch taking the mutex is reasonable, but it didn't explain one
thing, on why COLO is special here on managing the return path channel..

colo_process_checkpoint() does re-opening of rp channel even if it was
partially shutdown before:

colo_process_checkpoint():
    s->rp_state.from_dst_file = qemu_file_get_return_path(s->to_dst_file);

Should we instead leave all these to migration core?
E.g. migration_completion() will do close_return_path_on_source(), but I
wonder if that should only shutdown & close the return path channel when
without COLO running.  Then IIUC we can also leave the cleanup of the
qemufiles to migration_cleanup(), as it's also not special to COLO IIUC.

What do you think?

Thanks,

> ---
>  migration/colo.c | 23 +++++++++++++++--------
>  1 file changed, 15 insertions(+), 8 deletions(-)
> 
> diff --git a/migration/colo.c b/migration/colo.c
> index ce02c71d8857d470be434bdf3a9cacad3baab0d5..180793fe3f25140fa10887acc3d87515ebf43ac9 100644
> --- a/migration/colo.c
> +++ b/migration/colo.c
> @@ -173,11 +173,13 @@ static void primary_vm_do_failover(void)
>       * The s->rp_state.from_dst_file and s->to_dst_file may use the
>       * same fd, but we still shutdown the fd for twice, it is harmless.
>       */
> -    if (s->to_dst_file) {
> -        qemu_file_shutdown(s->to_dst_file);
> -    }
> -    if (s->rp_state.from_dst_file) {
> -        qemu_file_shutdown(s->rp_state.from_dst_file);
> +    WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
> +        if (s->to_dst_file) {
> +            qemu_file_shutdown(s->to_dst_file);
> +        }
> +        if (s->rp_state.from_dst_file) {
> +            qemu_file_shutdown(s->rp_state.from_dst_file);
> +        }
>      }
>  
>      old_state = failover_set_state(FAILOVER_STATUS_ACTIVE,
> @@ -544,11 +546,14 @@ static void colo_process_checkpoint(MigrationState *s)
>  
>      failover_init_state();
>  
> +    qemu_mutex_lock(&s->qemu_file_lock);
>      s->rp_state.from_dst_file = qemu_file_get_return_path(s->to_dst_file);
>      if (!s->rp_state.from_dst_file) {
> +        qemu_mutex_unlock(&s->qemu_file_lock);
>          error_report("Open QEMUFile from_dst_file failed");
>          goto out;
>      }
> +    qemu_mutex_unlock(&s->qemu_file_lock);
>  
>      packets_compare_notifier.notify = colo_compare_notify_checkpoint;
>      colo_compare_register_notifier(&packets_compare_notifier);
> @@ -640,9 +645,11 @@ out:
>       * Or the failover BH may shutdown the wrong fd that
>       * re-used by other threads after we release here.
>       */
> -    if (s->rp_state.from_dst_file) {
> -        qemu_fclose(s->rp_state.from_dst_file);
> -        s->rp_state.from_dst_file = NULL;
> +    WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
> +        if (s->rp_state.from_dst_file) {
> +            qemu_fclose(s->rp_state.from_dst_file);
> +            s->rp_state.from_dst_file = NULL;
> +        }
>      }
>  }
>  
> 
> -- 
> 2.39.5
> 

-- 
Peter Xu

Re: [PATCH v9 19/19] colo: Fix a rare crash during shutdown

Posted by Lukas Straub 1 month, 2 weeks ago

On Thu, 19 Feb 2026 16:33:19 -0500
Peter Xu <peterx@redhat.com> wrote:

> On Wed, Feb 18, 2026 at 10:29:39PM +0100, Lukas Straub wrote:
> > In the colo migration unit test, we shutdown all sockets with yank and
> > then stop qemu with SIGTERM. During shutdown migration_shutdown() calls
> > migration_cancel(). Now the colo thread frees s->rp_state.from_dst_file
> > which races with migration_cancel() checking for NULL and potentially calling
> > qemu_file_shutdown() on it.
> > 
> > Fix this by taking the s->qemu_file_lock.
> > 
> > Signed-off-by: Lukas Straub <lukasstraub2@web.de>  
> 
> The whole patch taking the mutex is reasonable, but it didn't explain one
> thing, on why COLO is special here on managing the return path channel..
> 
> colo_process_checkpoint() does re-opening of rp channel even if it was
> partially shutdown before:
> 
> colo_process_checkpoint():
>     s->rp_state.from_dst_file = qemu_file_get_return_path(s->to_dst_file);
> 
> Should we instead leave all these to migration core?
> E.g. migration_completion() will do close_return_path_on_source(), but I
> wonder if that should only shutdown & close the return path channel when
> without COLO running.  Then IIUC we can also leave the cleanup of the
> qemufiles to migration_cleanup(), as it's also not special to COLO IIUC.
> 
> What do you think?

It is a bit more involved since the return path needs to be always
opened for this. I implemented this in my current patchset.

Regards,
Lukas Straub

> 
> Thanks,
> 
> > ---
> >  migration/colo.c | 23 +++++++++++++++--------
> >  1 file changed, 15 insertions(+), 8 deletions(-)
> > 
> > diff --git a/migration/colo.c b/migration/colo.c
> > index ce02c71d8857d470be434bdf3a9cacad3baab0d5..180793fe3f25140fa10887acc3d87515ebf43ac9 100644
> > --- a/migration/colo.c
> > +++ b/migration/colo.c
> > @@ -173,11 +173,13 @@ static void primary_vm_do_failover(void)
> >       * The s->rp_state.from_dst_file and s->to_dst_file may use the
> >       * same fd, but we still shutdown the fd for twice, it is harmless.
> >       */
> > -    if (s->to_dst_file) {
> > -        qemu_file_shutdown(s->to_dst_file);
> > -    }
> > -    if (s->rp_state.from_dst_file) {
> > -        qemu_file_shutdown(s->rp_state.from_dst_file);
> > +    WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
> > +        if (s->to_dst_file) {
> > +            qemu_file_shutdown(s->to_dst_file);
> > +        }
> > +        if (s->rp_state.from_dst_file) {
> > +            qemu_file_shutdown(s->rp_state.from_dst_file);
> > +        }
> >      }
> >  
> >      old_state = failover_set_state(FAILOVER_STATUS_ACTIVE,
> > @@ -544,11 +546,14 @@ static void colo_process_checkpoint(MigrationState *s)
> >  
> >      failover_init_state();
> >  
> > +    qemu_mutex_lock(&s->qemu_file_lock);
> >      s->rp_state.from_dst_file = qemu_file_get_return_path(s->to_dst_file);
> >      if (!s->rp_state.from_dst_file) {
> > +        qemu_mutex_unlock(&s->qemu_file_lock);
> >          error_report("Open QEMUFile from_dst_file failed");
> >          goto out;
> >      }
> > +    qemu_mutex_unlock(&s->qemu_file_lock);
> >  
> >      packets_compare_notifier.notify = colo_compare_notify_checkpoint;
> >      colo_compare_register_notifier(&packets_compare_notifier);
> > @@ -640,9 +645,11 @@ out:
> >       * Or the failover BH may shutdown the wrong fd that
> >       * re-used by other threads after we release here.
> >       */
> > -    if (s->rp_state.from_dst_file) {
> > -        qemu_fclose(s->rp_state.from_dst_file);
> > -        s->rp_state.from_dst_file = NULL;
> > +    WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
> > +        if (s->rp_state.from_dst_file) {
> > +            qemu_fclose(s->rp_state.from_dst_file);
> > +            s->rp_state.from_dst_file = NULL;
> > +        }
> >      }
> >  }
> >  
> > 
> > -- 
> > 2.39.5
> >   
>