[PATCH v2 5/8] migration-test: Add COLO migration unit test

Lukas Straub posted 8 patches 2 weeks, 6 days ago
Maintainers: Peter Xu <peterx@redhat.com>, Fabiano Rosas <farosas@suse.de>, Lukas Straub <lukasstraub2@web.de>, Laurent Vivier <lvivier@redhat.com>, Paolo Bonzini <pbonzini@redhat.com>
There is a newer version of this series
[PATCH v2 5/8] migration-test: Add COLO migration unit test
Posted by Lukas Straub 2 weeks, 6 days ago
Add a COLO migration test for COLO migration and failover.

COLO does not support q35 machine at this time.

Signed-off-by: Lukas Straub <lukasstraub2@web.de>
---
 MAINTAINERS                        |   1 +
 tests/qtest/meson.build            |   7 ++-
 tests/qtest/migration-test.c       |   1 +
 tests/qtest/migration/colo-tests.c | 113 +++++++++++++++++++++++++++++++++++++
 tests/qtest/migration/framework.c  |  87 +++++++++++++++++++++++++++-
 tests/qtest/migration/framework.h  |  10 ++++
 6 files changed, 217 insertions(+), 2 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index dbb217255c2cf35dc0ce971c2021b130fac5469b..92ca20c9d4186a08519d15bfe8cbd583ab061a8b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3840,6 +3840,7 @@ F: migration/colo*
 F: migration/multifd-colo.*
 F: include/migration/colo.h
 F: include/migration/failover.h
+F: tests/qtest/migration/colo-tests.c
 F: docs/COLO-FT.txt
 
 COLO Proxy
diff --git a/tests/qtest/meson.build b/tests/qtest/meson.build
index 0f053fb56de5806d3c213e3a26c0b19998ae151a..d0129af4431bb08a94a918a1e40a8f657059d764 100644
--- a/tests/qtest/meson.build
+++ b/tests/qtest/meson.build
@@ -367,6 +367,11 @@ if gnutls.found()
   endif
 endif
 
+migration_colo_files = []
+if get_option('replication').allowed()
+  migration_colo_files = [files('migration/colo-tests.c')]
+endif
+
 qtests = {
   'aspeed_hace-test': files('aspeed-hace-utils.c', 'aspeed_hace-test.c'),
   'aspeed_smc-test': files('aspeed-smc-utils.c', 'aspeed_smc-test.c'),
@@ -378,7 +383,7 @@ qtests = {
                              'migration/migration-util.c') + dbus_vmstate1,
   'erst-test': files('erst-test.c'),
   'ivshmem-test': [rt, '../../contrib/ivshmem-server/ivshmem-server.c'],
-  'migration-test': test_migration_files + migration_tls_files,
+  'migration-test': test_migration_files + migration_tls_files + migration_colo_files,
   'pxe-test': files('boot-sector.c'),
   'pnv-xive2-test': files('pnv-xive2-common.c', 'pnv-xive2-flush-sync.c',
                           'pnv-xive2-nvpg_bar.c'),
diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index 08936871741535c926eeac40a7d7c3f461c72fd0..e582f05c7dc2673dbd05a936df8feb6c964b5bbc 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -55,6 +55,7 @@ int main(int argc, char **argv)
     migration_test_add_precopy(env);
     migration_test_add_cpr(env);
     migration_test_add_misc(env);
+    migration_test_add_colo(env);
 
     ret = g_test_run();
 
diff --git a/tests/qtest/migration/colo-tests.c b/tests/qtest/migration/colo-tests.c
new file mode 100644
index 0000000000000000000000000000000000000000..5004f581e4d9e4e6f54eee6d70a9307b7fd123be
--- /dev/null
+++ b/tests/qtest/migration/colo-tests.c
@@ -0,0 +1,113 @@
+/*
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * QTest testcases for COLO migration
+ *
+ * Copyright (c) 2025 Lukas Straub <lukasstraub2@web.de>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "libqtest.h"
+#include "migration/framework.h"
+#include "migration/migration-qmp.h"
+#include "migration/migration-util.h"
+#include "qemu/module.h"
+
+static void test_colo_plain_common(MigrateCommon *args,
+                                   bool failover_during_checkpoint,
+                                   bool primary_failover)
+{
+    args->listen_uri = "tcp:127.0.0.1:0";
+    test_colo_common(args, failover_during_checkpoint, primary_failover);
+}
+
+static void *hook_start_multifd(QTestState *from, QTestState *to)
+{
+    return migrate_hook_start_precopy_tcp_multifd_common(from, to, "none");
+}
+
+static void test_colo_multifd_common(MigrateCommon *args,
+                                     bool failover_during_checkpoint,
+                                     bool primary_failover)
+{
+    args->listen_uri = "defer";
+    args->start_hook = hook_start_multifd;
+    args->start.caps[MIGRATION_CAPABILITY_MULTIFD] = true;
+    test_colo_common(args, failover_during_checkpoint, primary_failover);
+}
+
+static void test_colo_plain_primary_failover(char *name, MigrateCommon *args)
+{
+    test_colo_plain_common(args, false, true);
+}
+
+static void test_colo_plain_secondary_failover(char *name, MigrateCommon *args)
+{
+    test_colo_plain_common(args, false, false);
+}
+
+static void test_colo_multifd_primary_failover(char *name, MigrateCommon *args)
+{
+    test_colo_multifd_common(args, false, true);
+}
+
+static void test_colo_multifd_secondary_failover(char *name,
+                                                 MigrateCommon *args)
+{
+    test_colo_multifd_common(args, false, false);
+}
+
+static void test_colo_plain_primary_failover_checkpoint(char *name,
+                                                        MigrateCommon *args)
+{
+    test_colo_plain_common(args, true, true);
+}
+
+static void test_colo_plain_secondary_failover_checkpoint(char *name,
+                                                          MigrateCommon *args)
+{
+    test_colo_plain_common(args, true, false);
+}
+
+static void test_colo_multifd_primary_failover_checkpoint(char *name,
+                                                          MigrateCommon *args)
+{
+    test_colo_multifd_common(args, true, true);
+}
+
+static void test_colo_multifd_secondary_failover_checkpoint(char *name,
+                                                            MigrateCommon *args)
+{
+    test_colo_multifd_common(args, true, false);
+}
+
+void migration_test_add_colo(MigrationTestEnv *env)
+{
+    if (!env->full_set) {
+        return;
+    }
+
+    migration_test_add("/migration/colo/plain/primary_failover",
+                       test_colo_plain_primary_failover);
+    migration_test_add("/migration/colo/plain/secondary_failover",
+                       test_colo_plain_secondary_failover);
+
+    migration_test_add("/migration/colo/multifd/primary_failover",
+                       test_colo_multifd_primary_failover);
+    migration_test_add("/migration/colo/multifd/secondary_failover",
+                       test_colo_multifd_secondary_failover);
+
+    migration_test_add("/migration/colo/plain/primary_failover_checkpoint",
+                       test_colo_plain_primary_failover_checkpoint);
+    migration_test_add("/migration/colo/plain/secondary_failover_checkpoint",
+                       test_colo_plain_secondary_failover_checkpoint);
+
+    migration_test_add("/migration/colo/multifd/primary_failover_checkpoint",
+                       test_colo_multifd_primary_failover_checkpoint);
+    migration_test_add("/migration/colo/multifd/secondary_failover_checkpoint",
+                       test_colo_multifd_secondary_failover_checkpoint);
+}
diff --git a/tests/qtest/migration/framework.c b/tests/qtest/migration/framework.c
index 57d3b9b7c5a269d31659971e308367bd916d28f6..fe34e7cc7a1a4eeb8d5219f54733bbd8446b0e4e 100644
--- a/tests/qtest/migration/framework.c
+++ b/tests/qtest/migration/framework.c
@@ -315,7 +315,7 @@ int migrate_args(char **from, char **to, const char *uri, MigrateStart *args)
     if (strcmp(arch, "i386") == 0 || strcmp(arch, "x86_64") == 0) {
         memory_size = "150M";
 
-        if (g_str_equal(arch, "i386")) {
+        if (g_str_equal(arch, "i386") || args->force_pc_machine) {
             machine_alias = "pc";
         } else {
             machine_alias = "q35";
@@ -1066,6 +1066,91 @@ void *migrate_hook_start_precopy_tcp_multifd_common(QTestState *from,
     return NULL;
 }
 
+int test_colo_common(MigrateCommon *args, bool failover_during_checkpoint,
+                     bool primary_failover)
+{
+    QTestState *from, *to;
+    void *data_hook = NULL;
+
+    /*
+     * For the COLO test, both VMs will run in parallel. Thus both VMs want to
+     * open the image read/write at the same time. Using read-only=on is not
+     * possible here, because ide-hd does not support read-only backing image.
+     *
+     * So use -snapshot, where each qemu instance creates its own writable
+     * snapshot internally while leaving the real image read-only.
+     */
+    args->start.opts_source = "-snapshot";
+    args->start.opts_target = "-snapshot";
+
+    /*
+     * COLO migration code logs many errors when the migration socket
+     * is shut down, these are expected so we hide them here.
+     */
+    args->start.hide_stderr = true;
+
+    /*
+     * COLO currently does not work with Q35 machine
+     */
+    args->start.force_pc_machine = true;
+
+    args->start.oob = true;
+    args->start.caps[MIGRATION_CAPABILITY_X_COLO] = true;
+
+    if (migrate_start(&from, &to, args->listen_uri, &args->start)) {
+        return -1;
+    }
+
+    migrate_set_parameter_int(from, "x-checkpoint-delay", 300);
+
+    if (args->start_hook) {
+        data_hook = args->start_hook(from, to);
+    }
+
+    migrate_ensure_converge(from);
+    wait_for_serial("src_serial");
+
+    migrate_qmp(from, to, args->connect_uri, NULL, "{}");
+
+    wait_for_migration_status(from, "colo", NULL);
+    wait_for_resume(to, &dst_state);
+
+    wait_for_serial("src_serial");
+    wait_for_serial("dest_serial");
+
+    /* wait for 3 checkpoints */
+    for (int i = 0; i < 3; i++) {
+        qtest_qmp_eventwait(to, "RESUME");
+        wait_for_serial("src_serial");
+        wait_for_serial("dest_serial");
+    }
+
+    if (failover_during_checkpoint) {
+        qtest_qmp_eventwait(to, "STOP");
+    }
+    if (primary_failover) {
+        qtest_qmp_assert_success(from, "{'exec-oob': 'yank', 'id': 'yank-cmd', "
+                                            "'arguments': {'instances':"
+                                                "[{'type': 'migration'}]}}");
+        qtest_qmp_assert_success(from, "{'execute': 'x-colo-lost-heartbeat'}");
+        wait_for_serial("src_serial");
+    } else {
+        qtest_qmp_assert_success(to, "{'exec-oob': 'yank', 'id': 'yank-cmd', "
+                                        "'arguments': {'instances':"
+                                            "[{'type': 'migration'}]}}");
+        qtest_qmp_assert_success(to, "{'execute': 'x-colo-lost-heartbeat'}");
+        wait_for_serial("dest_serial");
+    }
+
+    if (args->end_hook) {
+        args->end_hook(from, to, data_hook);
+    }
+
+    migrate_end(from, to, !primary_failover);
+
+    return 0;
+}
+
 QTestMigrationState *get_src(void)
 {
     return &src_state;
diff --git a/tests/qtest/migration/framework.h b/tests/qtest/migration/framework.h
index 2ef0f57962605c9e3bc7b7de48e52351e5389138..75088c5fb098a0f95acb1e23585d3b6e8307451e 100644
--- a/tests/qtest/migration/framework.h
+++ b/tests/qtest/migration/framework.h
@@ -139,6 +139,9 @@ typedef struct {
     /* Do not connect to target monitor and qtest sockets in qtest_init */
     bool defer_target_connect;
 
+    /* Use pc machine for x86_64 */
+    bool force_pc_machine;
+
     /*
      * Migration capabilities to be set in both source and
      * destination. For unilateral capabilities, use
@@ -248,6 +251,8 @@ void test_postcopy_common(MigrateCommon *args);
 void test_postcopy_recovery_common(MigrateCommon *args);
 int test_precopy_common(MigrateCommon *args);
 void test_file_common(MigrateCommon *args, bool stop_src);
+int test_colo_common(MigrateCommon *args, bool failover_during_checkpoint,
+                     bool colo_primary_failover);
 void *migrate_hook_start_precopy_tcp_multifd_common(QTestState *from,
                                                     QTestState *to,
                                                     const char *method);
@@ -267,5 +272,10 @@ void migration_test_add_file(MigrationTestEnv *env);
 void migration_test_add_precopy(MigrationTestEnv *env);
 void migration_test_add_cpr(MigrationTestEnv *env);
 void migration_test_add_misc(MigrationTestEnv *env);
+#ifdef CONFIG_REPLICATION
+void migration_test_add_colo(MigrationTestEnv *env);
+#else
+static inline void migration_test_add_colo(MigrationTestEnv *env) {};
+#endif
 
 #endif /* TEST_FRAMEWORK_H */

-- 
2.39.5
Re: [PATCH v2 5/8] migration-test: Add COLO migration unit test
Posted by Peter Xu 2 weeks, 3 days ago
On Sat, Jan 17, 2026 at 03:09:12PM +0100, Lukas Straub wrote:
> Add a COLO migration test for COLO migration and failover.
> 
> COLO does not support q35 machine at this time.
> 
> Signed-off-by: Lukas Straub <lukasstraub2@web.de>
> ---
>  MAINTAINERS                        |   1 +
>  tests/qtest/meson.build            |   7 ++-
>  tests/qtest/migration-test.c       |   1 +
>  tests/qtest/migration/colo-tests.c | 113 +++++++++++++++++++++++++++++++++++++
>  tests/qtest/migration/framework.c  |  87 +++++++++++++++++++++++++++-
>  tests/qtest/migration/framework.h  |  10 ++++
>  6 files changed, 217 insertions(+), 2 deletions(-)
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index dbb217255c2cf35dc0ce971c2021b130fac5469b..92ca20c9d4186a08519d15bfe8cbd583ab061a8b 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -3840,6 +3840,7 @@ F: migration/colo*
>  F: migration/multifd-colo.*
>  F: include/migration/colo.h
>  F: include/migration/failover.h
> +F: tests/qtest/migration/colo-tests.c
>  F: docs/COLO-FT.txt
>  
>  COLO Proxy
> diff --git a/tests/qtest/meson.build b/tests/qtest/meson.build
> index 0f053fb56de5806d3c213e3a26c0b19998ae151a..d0129af4431bb08a94a918a1e40a8f657059d764 100644
> --- a/tests/qtest/meson.build
> +++ b/tests/qtest/meson.build
> @@ -367,6 +367,11 @@ if gnutls.found()
>    endif
>  endif
>  
> +migration_colo_files = []
> +if get_option('replication').allowed()
> +  migration_colo_files = [files('migration/colo-tests.c')]
> +endif
> +
>  qtests = {
>    'aspeed_hace-test': files('aspeed-hace-utils.c', 'aspeed_hace-test.c'),
>    'aspeed_smc-test': files('aspeed-smc-utils.c', 'aspeed_smc-test.c'),
> @@ -378,7 +383,7 @@ qtests = {
>                               'migration/migration-util.c') + dbus_vmstate1,
>    'erst-test': files('erst-test.c'),
>    'ivshmem-test': [rt, '../../contrib/ivshmem-server/ivshmem-server.c'],
> -  'migration-test': test_migration_files + migration_tls_files,
> +  'migration-test': test_migration_files + migration_tls_files + migration_colo_files,
>    'pxe-test': files('boot-sector.c'),
>    'pnv-xive2-test': files('pnv-xive2-common.c', 'pnv-xive2-flush-sync.c',
>                            'pnv-xive2-nvpg_bar.c'),
> diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
> index 08936871741535c926eeac40a7d7c3f461c72fd0..e582f05c7dc2673dbd05a936df8feb6c964b5bbc 100644
> --- a/tests/qtest/migration-test.c
> +++ b/tests/qtest/migration-test.c
> @@ -55,6 +55,7 @@ int main(int argc, char **argv)
>      migration_test_add_precopy(env);
>      migration_test_add_cpr(env);
>      migration_test_add_misc(env);
> +    migration_test_add_colo(env);
>  
>      ret = g_test_run();
>  
> diff --git a/tests/qtest/migration/colo-tests.c b/tests/qtest/migration/colo-tests.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..5004f581e4d9e4e6f54eee6d70a9307b7fd123be
> --- /dev/null
> +++ b/tests/qtest/migration/colo-tests.c
> @@ -0,0 +1,113 @@
> +/*
> + * SPDX-License-Identifier: GPL-2.0-or-later
> + *
> + * QTest testcases for COLO migration
> + *
> + * Copyright (c) 2025 Lukas Straub <lukasstraub2@web.de>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + *
> + */
> +
> +#include "qemu/osdep.h"
> +#include "libqtest.h"
> +#include "migration/framework.h"
> +#include "migration/migration-qmp.h"
> +#include "migration/migration-util.h"
> +#include "qemu/module.h"
> +
> +static void test_colo_plain_common(MigrateCommon *args,
> +                                   bool failover_during_checkpoint,
> +                                   bool primary_failover)
> +{
> +    args->listen_uri = "tcp:127.0.0.1:0";
> +    test_colo_common(args, failover_during_checkpoint, primary_failover);
> +}
> +
> +static void *hook_start_multifd(QTestState *from, QTestState *to)
> +{
> +    return migrate_hook_start_precopy_tcp_multifd_common(from, to, "none");
> +}
> +
> +static void test_colo_multifd_common(MigrateCommon *args,
> +                                     bool failover_during_checkpoint,
> +                                     bool primary_failover)
> +{
> +    args->listen_uri = "defer";
> +    args->start_hook = hook_start_multifd;
> +    args->start.caps[MIGRATION_CAPABILITY_MULTIFD] = true;
> +    test_colo_common(args, failover_during_checkpoint, primary_failover);
> +}
> +
> +static void test_colo_plain_primary_failover(char *name, MigrateCommon *args)
> +{
> +    test_colo_plain_common(args, false, true);
> +}
> +
> +static void test_colo_plain_secondary_failover(char *name, MigrateCommon *args)
> +{
> +    test_colo_plain_common(args, false, false);
> +}
> +
> +static void test_colo_multifd_primary_failover(char *name, MigrateCommon *args)
> +{
> +    test_colo_multifd_common(args, false, true);
> +}
> +
> +static void test_colo_multifd_secondary_failover(char *name,
> +                                                 MigrateCommon *args)
> +{
> +    test_colo_multifd_common(args, false, false);
> +}
> +
> +static void test_colo_plain_primary_failover_checkpoint(char *name,
> +                                                        MigrateCommon *args)
> +{
> +    test_colo_plain_common(args, true, true);
> +}
> +
> +static void test_colo_plain_secondary_failover_checkpoint(char *name,
> +                                                          MigrateCommon *args)
> +{
> +    test_colo_plain_common(args, true, false);
> +}
> +
> +static void test_colo_multifd_primary_failover_checkpoint(char *name,
> +                                                          MigrateCommon *args)
> +{
> +    test_colo_multifd_common(args, true, true);
> +}
> +
> +static void test_colo_multifd_secondary_failover_checkpoint(char *name,
> +                                                            MigrateCommon *args)
> +{
> +    test_colo_multifd_common(args, true, false);
> +}
> +
> +void migration_test_add_colo(MigrationTestEnv *env)
> +{
> +    if (!env->full_set) {
> +        return;
> +    }
> +
> +    migration_test_add("/migration/colo/plain/primary_failover",
> +                       test_colo_plain_primary_failover);
> +    migration_test_add("/migration/colo/plain/secondary_failover",
> +                       test_colo_plain_secondary_failover);
> +
> +    migration_test_add("/migration/colo/multifd/primary_failover",
> +                       test_colo_multifd_primary_failover);
> +    migration_test_add("/migration/colo/multifd/secondary_failover",
> +                       test_colo_multifd_secondary_failover);
> +
> +    migration_test_add("/migration/colo/plain/primary_failover_checkpoint",
> +                       test_colo_plain_primary_failover_checkpoint);
> +    migration_test_add("/migration/colo/plain/secondary_failover_checkpoint",
> +                       test_colo_plain_secondary_failover_checkpoint);
> +
> +    migration_test_add("/migration/colo/multifd/primary_failover_checkpoint",
> +                       test_colo_multifd_primary_failover_checkpoint);
> +    migration_test_add("/migration/colo/multifd/secondary_failover_checkpoint",
> +                       test_colo_multifd_secondary_failover_checkpoint);
> +}
> diff --git a/tests/qtest/migration/framework.c b/tests/qtest/migration/framework.c
> index 57d3b9b7c5a269d31659971e308367bd916d28f6..fe34e7cc7a1a4eeb8d5219f54733bbd8446b0e4e 100644
> --- a/tests/qtest/migration/framework.c
> +++ b/tests/qtest/migration/framework.c
> @@ -315,7 +315,7 @@ int migrate_args(char **from, char **to, const char *uri, MigrateStart *args)
>      if (strcmp(arch, "i386") == 0 || strcmp(arch, "x86_64") == 0) {
>          memory_size = "150M";
>  
> -        if (g_str_equal(arch, "i386")) {
> +        if (g_str_equal(arch, "i386") || args->force_pc_machine) {

The naming is better, thanks.  That said, force_pc_machine is still unwanted
if we can drop it.  I asked about this in v1:

https://lore.kernel.org/qemu-devel/aWltRH6Nra-Tji7w@x1.local/

Can we explore that possibility?

>              machine_alias = "pc";
>          } else {
>              machine_alias = "q35";
> @@ -1066,6 +1066,91 @@ void *migrate_hook_start_precopy_tcp_multifd_common(QTestState *from,
>      return NULL;
>  }
>  
> +int test_colo_common(MigrateCommon *args, bool failover_during_checkpoint,
> +                     bool primary_failover)
> +{
> +    QTestState *from, *to;
> +    void *data_hook = NULL;
> +
> +    /*
> +     * For the COLO test, both VMs will run in parallel. Thus both VMs want to
> +     * open the image read/write at the same time. Using read-only=on is not
> +     * possible here, because ide-hd does not support read-only backing image.
> +     *
> +     * So use -snapshot, where each qemu instance creates its own writable
> +     * snapshot internally while leaving the real image read-only.
> +     */
> +    args->start.opts_source = "-snapshot";
> +    args->start.opts_target = "-snapshot";
> +
> +    /*
> +     * COLO migration code logs many errors when the migration socket
> +     * is shut down, these are expected so we hide them here.
> +     */
> +    args->start.hide_stderr = true;
> +
> +    /*
> +     * COLO currently does not work with Q35 machine
> +     */
> +    args->start.force_pc_machine = true;
> +
> +    args->start.oob = true;

Just curious: is OOB required in COLO for some reason?  I understand that the
yank command you use below is run out-of-band, so my question is really about
what is behind that: what can block in the main thread, and what is special
about COLO here?

> +    args->start.caps[MIGRATION_CAPABILITY_X_COLO] = true;
> +
> +    if (migrate_start(&from, &to, args->listen_uri, &args->start)) {
> +        return -1;
> +    }
> +
> +    migrate_set_parameter_int(from, "x-checkpoint-delay", 300);
> +
> +    if (args->start_hook) {
> +        data_hook = args->start_hook(from, to);
> +    }
> +
> +    migrate_ensure_converge(from);
> +    wait_for_serial("src_serial");
> +
> +    migrate_qmp(from, to, args->connect_uri, NULL, "{}");
> +
> +    wait_for_migration_status(from, "colo", NULL);
> +    wait_for_resume(to, &dst_state);

We can move this whole function into colo-tests.c.  Here you may want to
use get_dst() instead of &dst_state.

> +
> +    wait_for_serial("src_serial");
> +    wait_for_serial("dest_serial");
> +
> +    /* wait for 3 checkpoints */
> +    for (int i = 0; i < 3; i++) {
> +        qtest_qmp_eventwait(to, "RESUME");
> +        wait_for_serial("src_serial");
> +        wait_for_serial("dest_serial");
> +    }
> +
> +    if (failover_during_checkpoint) {
> +        qtest_qmp_eventwait(to, "STOP");
> +    }
> +    if (primary_failover) {
> +        qtest_qmp_assert_success(from, "{'exec-oob': 'yank', 'id': 'yank-cmd', "
> +                                            "'arguments': {'instances':"
> +                                                "[{'type': 'migration'}]}}");
> +        qtest_qmp_assert_success(from, "{'execute': 'x-colo-lost-heartbeat'}");
> +        wait_for_serial("src_serial");
> +    } else {
> +        qtest_qmp_assert_success(to, "{'exec-oob': 'yank', 'id': 'yank-cmd', "
> +                                        "'arguments': {'instances':"
> +                                            "[{'type': 'migration'}]}}");
> +        qtest_qmp_assert_success(to, "{'execute': 'x-colo-lost-heartbeat'}");
> +        wait_for_serial("dest_serial");
> +    }
> +
> +    if (args->end_hook) {
> +        args->end_hook(from, to, data_hook);
> +    }
> +
> +    migrate_end(from, to, !primary_failover);
> +
> +    return 0;
> +}
> +
>  QTestMigrationState *get_src(void)
>  {
>      return &src_state;
> diff --git a/tests/qtest/migration/framework.h b/tests/qtest/migration/framework.h
> index 2ef0f57962605c9e3bc7b7de48e52351e5389138..75088c5fb098a0f95acb1e23585d3b6e8307451e 100644
> --- a/tests/qtest/migration/framework.h
> +++ b/tests/qtest/migration/framework.h
> @@ -139,6 +139,9 @@ typedef struct {
>      /* Do not connect to target monitor and qtest sockets in qtest_init */
>      bool defer_target_connect;
>  
> +    /* Use pc machine for x86_64 */
> +    bool force_pc_machine;
> +
>      /*
>       * Migration capabilities to be set in both source and
>       * destination. For unilateral capabilities, use
> @@ -248,6 +251,8 @@ void test_postcopy_common(MigrateCommon *args);
>  void test_postcopy_recovery_common(MigrateCommon *args);
>  int test_precopy_common(MigrateCommon *args);
>  void test_file_common(MigrateCommon *args, bool stop_src);
> +int test_colo_common(MigrateCommon *args, bool failover_during_checkpoint,
> +                     bool colo_primary_failover);
>  void *migrate_hook_start_precopy_tcp_multifd_common(QTestState *from,
>                                                      QTestState *to,
>                                                      const char *method);
> @@ -267,5 +272,10 @@ void migration_test_add_file(MigrationTestEnv *env);
>  void migration_test_add_precopy(MigrationTestEnv *env);
>  void migration_test_add_cpr(MigrationTestEnv *env);
>  void migration_test_add_misc(MigrationTestEnv *env);
> +#ifdef CONFIG_REPLICATION
> +void migration_test_add_colo(MigrationTestEnv *env);
> +#else
> +static inline void migration_test_add_colo(MigrationTestEnv *env) {};
> +#endif
>  
>  #endif /* TEST_FRAMEWORK_H */
> 
> -- 
> 2.39.5
> 

-- 
Peter Xu
Re: [PATCH v2 5/8] migration-test: Add COLO migration unit test
Posted by Lukas Straub 2 weeks, 2 days ago
On Tue, 20 Jan 2026 12:23:08 -0500
Peter Xu <peterx@redhat.com> wrote:

> On Sat, Jan 17, 2026 at 03:09:12PM +0100, Lukas Straub wrote:
> > Add a COLO migration test for COLO migration and failover.
> > 
> > COLO does not support q35 machine at this time.
> > 
> > Signed-off-by: Lukas Straub <lukasstraub2@web.de>
> > ---
> >  MAINTAINERS                        |   1 +
> >  tests/qtest/meson.build            |   7 ++-
> >  tests/qtest/migration-test.c       |   1 +
> >  tests/qtest/migration/colo-tests.c | 113 +++++++++++++++++++++++++++++++++++++
> >  tests/qtest/migration/framework.c  |  87 +++++++++++++++++++++++++++-
> >  tests/qtest/migration/framework.h  |  10 ++++
> >  6 files changed, 217 insertions(+), 2 deletions(-)
> > 
> > diff --git a/MAINTAINERS b/MAINTAINERS
> > index dbb217255c2cf35dc0ce971c2021b130fac5469b..92ca20c9d4186a08519d15bfe8cbd583ab061a8b 100644
> > --- a/MAINTAINERS
> > +++ b/MAINTAINERS
> > @@ -3840,6 +3840,7 @@ F: migration/colo*
> >  F: migration/multifd-colo.*
> >  F: include/migration/colo.h
> >  F: include/migration/failover.h
> > +F: tests/qtest/migration/colo-tests.c
> >  F: docs/COLO-FT.txt
> >  
> >  COLO Proxy
> > diff --git a/tests/qtest/meson.build b/tests/qtest/meson.build
> > index 0f053fb56de5806d3c213e3a26c0b19998ae151a..d0129af4431bb08a94a918a1e40a8f657059d764 100644
> > --- a/tests/qtest/meson.build
> > +++ b/tests/qtest/meson.build
> > @@ -367,6 +367,11 @@ if gnutls.found()
> >    endif
> >  endif
> >  
> > +migration_colo_files = []
> > +if get_option('replication').allowed()
> > +  migration_colo_files = [files('migration/colo-tests.c')]
> > +endif
> > +
> >  qtests = {
> >    'aspeed_hace-test': files('aspeed-hace-utils.c', 'aspeed_hace-test.c'),
> >    'aspeed_smc-test': files('aspeed-smc-utils.c', 'aspeed_smc-test.c'),
> > @@ -378,7 +383,7 @@ qtests = {
> >                               'migration/migration-util.c') + dbus_vmstate1,
> >    'erst-test': files('erst-test.c'),
> >    'ivshmem-test': [rt, '../../contrib/ivshmem-server/ivshmem-server.c'],
> > -  'migration-test': test_migration_files + migration_tls_files,
> > +  'migration-test': test_migration_files + migration_tls_files + migration_colo_files,
> >    'pxe-test': files('boot-sector.c'),
> >    'pnv-xive2-test': files('pnv-xive2-common.c', 'pnv-xive2-flush-sync.c',
> >                            'pnv-xive2-nvpg_bar.c'),
> > diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
> > index 08936871741535c926eeac40a7d7c3f461c72fd0..e582f05c7dc2673dbd05a936df8feb6c964b5bbc 100644
> > --- a/tests/qtest/migration-test.c
> > +++ b/tests/qtest/migration-test.c
> > @@ -55,6 +55,7 @@ int main(int argc, char **argv)
> >      migration_test_add_precopy(env);
> >      migration_test_add_cpr(env);
> >      migration_test_add_misc(env);
> > +    migration_test_add_colo(env);
> >  
> >      ret = g_test_run();
> >  
> > diff --git a/tests/qtest/migration/colo-tests.c b/tests/qtest/migration/colo-tests.c
> > new file mode 100644
> > index 0000000000000000000000000000000000000000..5004f581e4d9e4e6f54eee6d70a9307b7fd123be
> > --- /dev/null
> > +++ b/tests/qtest/migration/colo-tests.c
> > @@ -0,0 +1,113 @@
> > +/*
> > + * SPDX-License-Identifier: GPL-2.0-or-later
> > + *
> > + * QTest testcases for COLO migration
> > + *
> > + * Copyright (c) 2025 Lukas Straub <lukasstraub2@web.de>
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> > + * See the COPYING file in the top-level directory.
> > + *
> > + */
> > +
> > +#include "qemu/osdep.h"
> > +#include "libqtest.h"
> > +#include "migration/framework.h"
> > +#include "migration/migration-qmp.h"
> > +#include "migration/migration-util.h"
> > +#include "qemu/module.h"
> > +
> > +static void test_colo_plain_common(MigrateCommon *args,
> > +                                   bool failover_during_checkpoint,
> > +                                   bool primary_failover)
> > +{
> > +    args->listen_uri = "tcp:127.0.0.1:0";
> > +    test_colo_common(args, failover_during_checkpoint, primary_failover);
> > +}
> > +
> > +static void *hook_start_multifd(QTestState *from, QTestState *to)
> > +{
> > +    return migrate_hook_start_precopy_tcp_multifd_common(from, to, "none");
> > +}
> > +
> > +static void test_colo_multifd_common(MigrateCommon *args,
> > +                                     bool failover_during_checkpoint,
> > +                                     bool primary_failover)
> > +{
> > +    args->listen_uri = "defer";
> > +    args->start_hook = hook_start_multifd;
> > +    args->start.caps[MIGRATION_CAPABILITY_MULTIFD] = true;
> > +    test_colo_common(args, failover_during_checkpoint, primary_failover);
> > +}
> > +
> > +static void test_colo_plain_primary_failover(char *name, MigrateCommon *args)
> > +{
> > +    test_colo_plain_common(args, false, true);
> > +}
> > +
> > +static void test_colo_plain_secondary_failover(char *name, MigrateCommon *args)
> > +{
> > +    test_colo_plain_common(args, false, false);
> > +}
> > +
> > +static void test_colo_multifd_primary_failover(char *name, MigrateCommon *args)
> > +{
> > +    test_colo_multifd_common(args, false, true);
> > +}
> > +
> > +static void test_colo_multifd_secondary_failover(char *name,
> > +                                                 MigrateCommon *args)
> > +{
> > +    test_colo_multifd_common(args, false, false);
> > +}
> > +
> > +static void test_colo_plain_primary_failover_checkpoint(char *name,
> > +                                                        MigrateCommon *args)
> > +{
> > +    test_colo_plain_common(args, true, true);
> > +}
> > +
> > +static void test_colo_plain_secondary_failover_checkpoint(char *name,
> > +                                                          MigrateCommon *args)
> > +{
> > +    test_colo_plain_common(args, true, false);
> > +}
> > +
> > +static void test_colo_multifd_primary_failover_checkpoint(char *name,
> > +                                                          MigrateCommon *args)
> > +{
> > +    test_colo_multifd_common(args, true, true);
> > +}
> > +
> > +static void test_colo_multifd_secondary_failover_checkpoint(char *name,
> > +                                                            MigrateCommon *args)
> > +{
> > +    test_colo_multifd_common(args, true, false);
> > +}
> > +
> > +void migration_test_add_colo(MigrationTestEnv *env)
> > +{
> > +    if (!env->full_set) {
> > +        return;
> > +    }
> > +
> > +    migration_test_add("/migration/colo/plain/primary_failover",
> > +                       test_colo_plain_primary_failover);
> > +    migration_test_add("/migration/colo/plain/secondary_failover",
> > +                       test_colo_plain_secondary_failover);
> > +
> > +    migration_test_add("/migration/colo/multifd/primary_failover",
> > +                       test_colo_multifd_primary_failover);
> > +    migration_test_add("/migration/colo/multifd/secondary_failover",
> > +                       test_colo_multifd_secondary_failover);
> > +
> > +    migration_test_add("/migration/colo/plain/primary_failover_checkpoint",
> > +                       test_colo_plain_primary_failover_checkpoint);
> > +    migration_test_add("/migration/colo/plain/secondary_failover_checkpoint",
> > +                       test_colo_plain_secondary_failover_checkpoint);
> > +
> > +    migration_test_add("/migration/colo/multifd/primary_failover_checkpoint",
> > +                       test_colo_multifd_primary_failover_checkpoint);
> > +    migration_test_add("/migration/colo/multifd/secondary_failover_checkpoint",
> > +                       test_colo_multifd_secondary_failover_checkpoint);
> > +}
> > diff --git a/tests/qtest/migration/framework.c b/tests/qtest/migration/framework.c
> > index 57d3b9b7c5a269d31659971e308367bd916d28f6..fe34e7cc7a1a4eeb8d5219f54733bbd8446b0e4e 100644
> > --- a/tests/qtest/migration/framework.c
> > +++ b/tests/qtest/migration/framework.c
> > @@ -315,7 +315,7 @@ int migrate_args(char **from, char **to, const char *uri, MigrateStart *args)
> >      if (strcmp(arch, "i386") == 0 || strcmp(arch, "x86_64") == 0) {
> >          memory_size = "150M";
> >  
> > -        if (g_str_equal(arch, "i386")) {
> > +        if (g_str_equal(arch, "i386") || args->force_pc_machine) {  
> 
> The naming is better, thanks.  That said, force_pc_machine is still
> unwanted if we can drop it.  I asked about this in v1:
> 
> https://lore.kernel.org/qemu-devel/aWltRH6Nra-Tji7w@x1.local/
> 
> Can we explore that possibility?

Never mind. I found the issue and will remove this in the next version.

> 
> >              machine_alias = "pc";
> >          } else {
> >              machine_alias = "q35";
> > @@ -1066,6 +1066,91 @@ void *migrate_hook_start_precopy_tcp_multifd_common(QTestState *from,
> >      return NULL;
> >  }
> >  
> > +int test_colo_common(MigrateCommon *args, bool failover_during_checkpoint,
> > +                     bool primary_failover)
> > +{
> > +    QTestState *from, *to;
> > +    void *data_hook = NULL;
> > +
> > +    /*
> > +     * For the COLO test, both VMs will run in parallel. Thus both VMs want to
> > +     * open the image read/write at the same time. Using read-only=on is not
> > +     * possible here, because ide-hd does not support read-only backing image.
> > +     *
> > +     * So use -snapshot, where each qemu instance creates its own writable
> > +     * snapshot internally while leaving the real image read-only.
> > +     */
> > +    args->start.opts_source = "-snapshot";
> > +    args->start.opts_target = "-snapshot";
> > +
> > +    /*
> > +     * COLO migration code logs many errors when the migration socket
> > +     * is shut down, these are expected so we hide them here.
> > +     */
> > +    args->start.hide_stderr = true;
> > +
> > +    /*
> > +     * COLO currently does not work with Q35 machine
> > +     */
> > +    args->start.force_pc_machine = true;
> > +
> > +    args->start.oob = true;  
> 
> Just curious: is OOB required in COLO for some reason?  I understand that
> the yank command you use below is sent via OOB, so the underlying question
> is: what can block the main thread, and what is special about COLO here?
> 
> > +    args->start.caps[MIGRATION_CAPABILITY_X_COLO] = true;
> > +
> > +    if (migrate_start(&from, &to, args->listen_uri, &args->start)) {
> > +        return -1;
> > +    }
> > +
> > +    migrate_set_parameter_int(from, "x-checkpoint-delay", 300);
> > +
> > +    if (args->start_hook) {
> > +        data_hook = args->start_hook(from, to);
> > +    }
> > +
> > +    migrate_ensure_converge(from);
> > +    wait_for_serial("src_serial");
> > +
> > +    migrate_qmp(from, to, args->connect_uri, NULL, "{}");
> > +
> > +    wait_for_migration_status(from, "colo", NULL);
> > +    wait_for_resume(to, &dst_state);  
> 
> We can move this whole function into colo-tests.c.  Here you may want to
> use get_dst() instead.

Okay, will do that.

> 
> > +
> > +    wait_for_serial("src_serial");
> > +    wait_for_serial("dest_serial");
> > +
> > +    /* wait for 3 checkpoints */
> > +    for (int i = 0; i < 3; i++) {
> > +        qtest_qmp_eventwait(to, "RESUME");
> > +        wait_for_serial("src_serial");
> > +        wait_for_serial("dest_serial");
> > +    }
> > +
> > +    if (failover_during_checkpoint) {
> > +        qtest_qmp_eventwait(to, "STOP");
> > +    }
> > +    if (primary_failover) {
> > +        qtest_qmp_assert_success(from, "{'exec-oob': 'yank', 'id': 'yank-cmd', "
> > +                                            "'arguments': {'instances':"
> > +                                                "[{'type': 'migration'}]}}");
> > +        qtest_qmp_assert_success(from, "{'execute': 'x-colo-lost-heartbeat'}");
> > +        wait_for_serial("src_serial");
> > +    } else {
> > +        qtest_qmp_assert_success(to, "{'exec-oob': 'yank', 'id': 'yank-cmd', "
> > +                                        "'arguments': {'instances':"
> > +                                            "[{'type': 'migration'}]}}");
> > +        qtest_qmp_assert_success(to, "{'execute': 'x-colo-lost-heartbeat'}");
> > +        wait_for_serial("dest_serial");
> > +    }
> > +
> > +    if (args->end_hook) {
> > +        args->end_hook(from, to, data_hook);
> > +    }
> > +
> > +    migrate_end(from, to, !primary_failover);
> > +
> > +    return 0;
> > +}
> > +
> >  QTestMigrationState *get_src(void)
> >  {
> >      return &src_state;
> > diff --git a/tests/qtest/migration/framework.h b/tests/qtest/migration/framework.h
> > index 2ef0f57962605c9e3bc7b7de48e52351e5389138..75088c5fb098a0f95acb1e23585d3b6e8307451e 100644
> > --- a/tests/qtest/migration/framework.h
> > +++ b/tests/qtest/migration/framework.h
> > @@ -139,6 +139,9 @@ typedef struct {
> >      /* Do not connect to target monitor and qtest sockets in qtest_init */
> >      bool defer_target_connect;
> >  
> > +    /* Use pc machine for x86_64 */
> > +    bool force_pc_machine;
> > +
> >      /*
> >       * Migration capabilities to be set in both source and
> >       * destination. For unilateral capabilities, use
> > @@ -248,6 +251,8 @@ void test_postcopy_common(MigrateCommon *args);
> >  void test_postcopy_recovery_common(MigrateCommon *args);
> >  int test_precopy_common(MigrateCommon *args);
> >  void test_file_common(MigrateCommon *args, bool stop_src);
> > +int test_colo_common(MigrateCommon *args, bool failover_during_checkpoint,
> > +                     bool colo_primary_failover);
> >  void *migrate_hook_start_precopy_tcp_multifd_common(QTestState *from,
> >                                                      QTestState *to,
> >                                                      const char *method);
> > @@ -267,5 +272,10 @@ void migration_test_add_file(MigrationTestEnv *env);
> >  void migration_test_add_precopy(MigrationTestEnv *env);
> >  void migration_test_add_cpr(MigrationTestEnv *env);
> >  void migration_test_add_misc(MigrationTestEnv *env);
> > +#ifdef CONFIG_REPLICATION
> > +void migration_test_add_colo(MigrationTestEnv *env);
> > +#else
> > +static inline void migration_test_add_colo(MigrationTestEnv *env) {};
> > +#endif
> >  
> >  #endif /* TEST_FRAMEWORK_H */
> > 
> > -- 
> > 2.39.5
> >   
> 

Re: [PATCH v2 5/8] migration-test: Add COLO migration unit test
Posted by Lukas Straub 1 week, 5 days ago
On Wed, 21 Jan 2026 20:37:51 +0100
Lukas Straub <lukasstraub2@web.de> wrote:

> On Tue, 20 Jan 2026 12:23:08 -0500
> Peter Xu <peterx@redhat.com> wrote:
> 
> > On Sat, Jan 17, 2026 at 03:09:12PM +0100, Lukas Straub wrote:  
> > > Add a COLO migration test for COLO migration and failover.
> > > 
> > > COLO does not support q35 machine at this time.
> > > 
> > > [...]
> > >  
> > > +int test_colo_common(MigrateCommon *args, bool failover_during_checkpoint,
> > > +                     bool primary_failover)
> > > +{
> > > +    QTestState *from, *to;
> > > +    void *data_hook = NULL;
> > > +
> > > +    /*
> > > +     * For the COLO test, both VMs will run in parallel. Thus both VMs want to
> > > +     * open the image read/write at the same time. Using read-only=on is not
> > > +     * possible here, because ide-hd does not support read-only backing image.
> > > +     *
> > > +     * So use -snapshot, where each qemu instance creates its own writable
> > > +     * snapshot internally while leaving the real image read-only.
> > > +     */
> > > +    args->start.opts_source = "-snapshot";
> > > +    args->start.opts_target = "-snapshot";
> > > +
> > > +    /*
> > > +     * COLO migration code logs many errors when the migration socket
> > > +     * is shut down, these are expected so we hide them here.
> > > +     */
> > > +    args->start.hide_stderr = true;
> > > +
> > > +    /*
> > > +     * COLO currently does not work with Q35 machine
> > > +     */
> > > +    args->start.force_pc_machine = true;
> > > +
> > > +    args->start.oob = true;    
> > 
> > Just curious: is OOB required in COLO for some reason?  I understand yank
> > you used below uses OOB, so the question is behind that, on what can be
> > blocked in main thread, and special in COLO.

There is a lot that can hang:
The netfilters all run on the main loop and use blocking writes.
filter-mirror on the primary side mirrors packets to the secondary and
can hang.
filter-redirect on the secondary side redirects packets to the primary's
colo-compare and can hang.
The nbd client on the primary side that is connected to the nbd server
on the secondary side can hang, especially during vm_stop(), which flushes
all in-flight block I/O with the BQL held.

Regards,
Lukas Straub

> >   
> > > +    args->start.caps[MIGRATION_CAPABILITY_X_COLO] = true;
> > > +
> > > +    if (migrate_start(&from, &to, args->listen_uri, &args->start)) {
> > > +        return -1;
> > > +    }
> > > +
> > > +    migrate_set_parameter_int(from, "x-checkpoint-delay", 300);
> > > +
> > > +    if (args->start_hook) {
> > > +        data_hook = args->start_hook(from, to);
> > > +    }
> > > +
> > > +    migrate_ensure_converge(from);
> > > +    wait_for_serial("src_serial");
> > > +
> > > +    migrate_qmp(from, to, args->connect_uri, NULL, "{}");
> > > +
> > > +    wait_for_migration_status(from, "colo", NULL);
> > > +    wait_for_resume(to, &dst_state);    
> > 
> > We can move this whole function into colo-tests.c.  Here you may want to
> > use get_dst() instead.  
> 
> Okay, will do that.
> 
> >   
> > > +
> > > +    wait_for_serial("src_serial");
> > > +    wait_for_serial("dest_serial");
> > > +
> > > +    /* wait for 3 checkpoints */
> > > +    for (int i = 0; i < 3; i++) {
> > > +        qtest_qmp_eventwait(to, "RESUME");
> > > +        wait_for_serial("src_serial");
> > > +        wait_for_serial("dest_serial");
> > > +    }
> > > +
> > > +    if (failover_during_checkpoint) {
> > > +        qtest_qmp_eventwait(to, "STOP");
> > > +    }
> > > +    if (primary_failover) {
> > > +        qtest_qmp_assert_success(from, "{'exec-oob': 'yank', 'id': 'yank-cmd', "
> > > +                                            "'arguments': {'instances':"
> > > +                                                "[{'type': 'migration'}]}}");
> > > +        qtest_qmp_assert_success(from, "{'execute': 'x-colo-lost-heartbeat'}");
> > > +        wait_for_serial("src_serial");
> > > +    } else {
> > > +        qtest_qmp_assert_success(to, "{'exec-oob': 'yank', 'id': 'yank-cmd', "
> > > +                                        "'arguments': {'instances':"
> > > +                                            "[{'type': 'migration'}]}}");
> > > +        qtest_qmp_assert_success(to, "{'execute': 'x-colo-lost-heartbeat'}");
> > > +        wait_for_serial("dest_serial");
> > > +    }
> > > +
> > > +    if (args->end_hook) {
> > > +        args->end_hook(from, to, data_hook);
> > > +    }
> > > +
> > > +    migrate_end(from, to, !primary_failover);
> > > +
> > > +    return 0;
> > > +}
> > > +
> > >  QTestMigrationState *get_src(void)
> > >  {
> > >      return &src_state;
> > > [...]

Re: [PATCH v2 5/8] migration-test: Add COLO migration unit test
Posted by Peter Xu 1 week, 4 days ago
On Sun, Jan 25, 2026 at 06:18:36PM +0100, Lukas Straub wrote:
> On Wed, 21 Jan 2026 20:37:51 +0100
> Lukas Straub <lukasstraub2@web.de> wrote:
> 
> > On Tue, 20 Jan 2026 12:23:08 -0500
> > Peter Xu <peterx@redhat.com> wrote:
> > 
> > > On Sat, Jan 17, 2026 at 03:09:12PM +0100, Lukas Straub wrote:  
> > > > Add a COLO migration test for COLO migration and failover.
> > > > 
> > > > COLO does not support q35 machine at this time.
> > > > 
> > > > [...]
> > > >  
> > > > +int test_colo_common(MigrateCommon *args, bool failover_during_checkpoint,
> > > > +                     bool primary_failover)
> > > > +{
> > > > +    QTestState *from, *to;
> > > > +    void *data_hook = NULL;
> > > > +
> > > > +    /*
> > > > +     * For the COLO test, both VMs will run in parallel. Thus both VMs want to
> > > > +     * open the image read/write at the same time. Using read-only=on is not
> > > > +     * possible here, because ide-hd does not support read-only backing image.
> > > > +     *
> > > > +     * So use -snapshot, where each qemu instance creates its own writable
> > > > +     * snapshot internally while leaving the real image read-only.
> > > > +     */
> > > > +    args->start.opts_source = "-snapshot";
> > > > +    args->start.opts_target = "-snapshot";
> > > > +
> > > > +    /*
> > > > +     * COLO migration code logs many errors when the migration socket
> > > > +     * is shut down, these are expected so we hide them here.
> > > > +     */
> > > > +    args->start.hide_stderr = true;
> > > > +
> > > > +    /*
> > > > +     * COLO currently does not work with Q35 machine
> > > > +     */
> > > > +    args->start.force_pc_machine = true;
> > > > +
> > > > +    args->start.oob = true;    
> > > 
> > > Just curious: is OOB required in COLO for some reason?  I understand yank
> > > you used below uses OOB, so the question is behind that, on what can be
> > > blocked in main thread, and special in COLO.
> 
> There is a lot that can hang:
> The netfilters all run on the main loop and use blocking writes.
> filter-mirror on the primary side mirrors packets to the secondary and
> can hang.
> filter-redirect on the secondary side redirects packets to the primary's
> colo-compare and can hang.
> The nbd client on the primary side that is connected to the nbd server
> on the secondary side can hang, especially during vm_stop(), which flushes
> all in-flight block I/O with the BQL held.

None of them are used in this unit test, right?

I agree that if OOB is needed in production, we should also enable it in the
unit tests.  That said, would you please add a comment to the test case
explaining this?  E.g. what can fail in reality, and why we still test with
OOB (because we want to get as close to the production COLO use case as
possible).

Thanks,

-- 
Peter Xu