[v3] migration: Add COLO multifd support and COLO migration unit test

[PATCH v3 04/10] multifd: Add COLO support

Posted by Lukas Straub 2 weeks, 1 day ago

Like in the normal ram_load() path, put the received pages into the
colo cache and mark the pages in the bitmap so that they will be
flushed to the guest later.

Multifd with COLO is useful to reduce the VM pause time during checkpointing
for latency sensitive workloads. In such workloads the worst-case latency
is especially important.

Also, this is already worth it for the precopy phase as it helps with
converging. Moreover, multifd migration is the preferred way to do migration
nowadays and this allows to use multifd compression with COLO.

Benchmark:
Cluster nodes
 - Intel Xenon E5-2630 v3
 - 48Gb RAM
 - 10G Ethernet
Guest
 - Windows Server 2016
 - 6Gb RAM
 - 4 cores
Workload
 - Upload a file to the guest with SMB to simulate moderate
   memory dirtying
 - Measure the memory transfer time portion of each checkpoint
 - 600ms COLO checkpoint interval

Results
Plain
 idle mean: 4.50ms 99per: 10.33ms
 load mean: 24.30ms 99per: 78.05ms
Multifd-4
 idle mean: 6.48ms 99per: 10.41ms
 load mean: 14.12ms 99per: 31.27ms

Evaluation
While multifd has slightly higher latency when the guest idles, it is
10ms faster under load and more importantly it's worst case latency is
less than 1/2 of plain under load as can be seen in the 99. Percentile.

Signed-off-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Lukas Straub <lukasstraub2@web.de>
---
 MAINTAINERS                |  1 +
 migration/meson.build      |  2 +-
 migration/multifd-colo.c   | 50 ++++++++++++++++++++++++++++++++++++++++++++++
 migration/multifd-colo.h   | 26 ++++++++++++++++++++++++
 migration/multifd-nocomp.c | 10 +++++++++-
 migration/multifd.c        |  8 ++++++++
 migration/multifd.h        |  5 ++++-
 7 files changed, 99 insertions(+), 3 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 1e9bdd87c3a2f84f3abfc56986cd793976810fdd..883f0a8f4eb92d0bf0f89fcab4674ccc4aed1cc1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3853,6 +3853,7 @@ COLO Framework
 M: Lukas Straub <lukasstraub2@web.de>
 S: Maintained
 F: migration/colo*
+F: migration/multifd-colo.*
 F: include/migration/colo.h
 F: include/migration/failover.h
 F: docs/COLO-FT.txt
diff --git a/migration/meson.build b/migration/meson.build
index c7f39bdb55239ecb0e775c77b90a1aa9e6a4a9ce..c9f0f5f9f2137536497e53e960ce70654ad1b394 100644
--- a/migration/meson.build
+++ b/migration/meson.build
@@ -39,7 +39,7 @@ system_ss.add(files(
 ), gnutls, zlib)
 
 if get_option('replication').allowed()
-  system_ss.add(files('colo-failover.c', 'colo.c'))
+  system_ss.add(files('colo-failover.c', 'colo.c', 'multifd-colo.c'))
 else
   system_ss.add(files('colo-stubs.c'))
 endif
diff --git a/migration/multifd-colo.c b/migration/multifd-colo.c
new file mode 100644
index 0000000000000000000000000000000000000000..c47f5044663969e0c9af56da5ec34902d635810a
--- /dev/null
+++ b/migration/multifd-colo.c
@@ -0,0 +1,50 @@
+/*
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * multifd colo implementation
+ *
+ * Copyright (c) Lukas Straub <lukasstraub2@web.de>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "exec/target_page.h"
+#include "qemu/error-report.h"
+#include "qapi/error.h"
+#include "ram.h"
+#include "multifd.h"
+#include "options.h"
+#include "io/channel-socket.h"
+#include "migration/colo.h"
+#include "multifd-colo.h"
+#include "system/ramblock.h"
+
+void multifd_colo_prepare_recv(MultiFDRecvParams *p)
+{
+    /*
+     * While we're still in precopy state (not yet in colo state), we copy
+     * received pages to both guest and cache. No need to set dirty bits,
+     * since guest and cache memory are in sync.
+     */
+    if (migration_incoming_in_colo_state()) {
+        colo_record_bitmap(p->block, p->normal, p->normal_num);
+        colo_record_bitmap(p->block, p->zero, p->zero_num);
+    }
+}
+
+void multifd_colo_process_recv(MultiFDRecvParams *p)
+{
+    if (!migration_incoming_in_colo_state()) {
+        for (int i = 0; i < p->normal_num; i++) {
+            void *guest = p->block->host + p->normal[i];
+            void *cache = p->host + p->normal[i];
+            memcpy(guest, cache, multifd_ram_page_size());
+        }
+        for (int i = 0; i < p->zero_num; i++) {
+            void *guest = p->block->host + p->zero[i];
+            memset(guest, 0, multifd_ram_page_size());
+        }
+    }
+}
diff --git a/migration/multifd-colo.h b/migration/multifd-colo.h
new file mode 100644
index 0000000000000000000000000000000000000000..82eaf3f48c47de2f090f9de52f9d57a337d4754a
--- /dev/null
+++ b/migration/multifd-colo.h
@@ -0,0 +1,26 @@
+/*
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * multifd colo header
+ *
+ * Copyright (c) Lukas Straub <lukasstraub2@web.de>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_MIGRATION_MULTIFD_COLO_H
+#define QEMU_MIGRATION_MULTIFD_COLO_H
+
+#ifdef CONFIG_REPLICATION
+
+void multifd_colo_prepare_recv(MultiFDRecvParams *p);
+void multifd_colo_process_recv(MultiFDRecvParams *p);
+
+#else
+
+static inline void multifd_colo_prepare_recv(MultiFDRecvParams *p) {}
+static inline void multifd_colo_process_recv(MultiFDRecvParams *p) {}
+
+#endif
+#endif
diff --git a/migration/multifd-nocomp.c b/migration/multifd-nocomp.c
index 9be79b3b8e00371ebff9e112766c225bec260bf7..9f7a792fa761b3bc30b971b35f464103a61787f0 100644
--- a/migration/multifd-nocomp.c
+++ b/migration/multifd-nocomp.c
@@ -16,6 +16,7 @@
 #include "file.h"
 #include "migration-stats.h"
 #include "multifd.h"
+#include "multifd-colo.h"
 #include "options.h"
 #include "migration.h"
 #include "qapi/error.h"
@@ -269,7 +270,6 @@ int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp)
         return -1;
     }
 
-    p->host = p->block->host;
     for (i = 0; i < p->normal_num; i++) {
         uint64_t offset = be64_to_cpu(packet->offset[i]);
 
@@ -294,6 +294,14 @@ int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp)
         p->zero[i] = offset;
     }
 
+    if (migrate_colo()) {
+        multifd_colo_prepare_recv(p);
+        assert(p->block->colo_cache);
+        p->host = p->block->colo_cache;
+    } else {
+        p->host = p->block->host;
+    }
+
     return 0;
 }
 
diff --git a/migration/multifd.c b/migration/multifd.c
index 332e6fc58053462419f3171f6c320ac37648ef7b..220ed8564960fdabc58e4baa069dd252c8ad293c 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -29,6 +29,7 @@
 #include "qemu-file.h"
 #include "trace.h"
 #include "multifd.h"
+#include "multifd-colo.h"
 #include "options.h"
 #include "qemu/yank.h"
 #include "io/channel-file.h"
@@ -1258,6 +1259,13 @@ static int multifd_ram_state_recv(MultiFDRecvParams *p, Error **errp)
     int ret;
 
     ret = multifd_recv_state->ops->recv(p, errp);
+    if (ret != 0) {
+        return ret;
+    }
+
+    if (migrate_colo()) {
+        multifd_colo_process_recv(p);
+    }
 
     return ret;
 }
diff --git a/migration/multifd.h b/migration/multifd.h
index 89a395aef2b09a6762c45b5361e0ab63256feff6..fbc35702b062fdc3213ce92baed35994f5967c2b 100644
--- a/migration/multifd.h
+++ b/migration/multifd.h
@@ -279,7 +279,10 @@ typedef struct {
     uint64_t packets_recved;
     /* ramblock */
     RAMBlock *block;
-    /* ramblock host address */
+    /*
+     * Normally, it points to ramblock's host address.  When COLO
+     * is enabled, it points to the mirror cache for the ramblock.
+     */
     uint8_t *host;
     /* buffers to recv */
     struct iovec *iov;

-- 
2.39.5

Re: [PATCH v3 04/10] multifd: Add COLO support

Posted by Fabiano Rosas 2 weeks ago

Lukas Straub <lukasstraub2@web.de> writes:

> Like in the normal ram_load() path, put the received pages into the
> colo cache and mark the pages in the bitmap so that they will be
> flushed to the guest later.
>



> Multifd with COLO is useful to reduce the VM pause time during checkpointing
> for latency sensitive workloads. In such workloads the worst-case latency
> is especially important.
>
> Also, this is already worth it for the precopy phase as it helps with
> converging. Moreover, multifd migration is the preferred way to do migration
> nowadays and this allows to use multifd compression with COLO.
>
> Benchmark:
> Cluster nodes
>  - Intel Xenon E5-2630 v3
>  - 48Gb RAM
>  - 10G Ethernet
> Guest
>  - Windows Server 2016
>  - 6Gb RAM
>  - 4 cores
> Workload
>  - Upload a file to the guest with SMB to simulate moderate
>    memory dirtying
>  - Measure the memory transfer time portion of each checkpoint
>  - 600ms COLO checkpoint interval
>
> Results
> Plain
>  idle mean: 4.50ms 99per: 10.33ms
>  load mean: 24.30ms 99per: 78.05ms
> Multifd-4
>  idle mean: 6.48ms 99per: 10.41ms
>  load mean: 14.12ms 99per: 31.27ms
>
> Evaluation
> While multifd has slightly higher latency when the guest idles, it is
> 10ms faster under load and more importantly it's worst case latency is
> less than 1/2 of plain under load as can be seen in the 99. Percentile.
>
> Signed-off-by: Juan Quintela <quintela@redhat.com>
> Signed-off-by: Lukas Straub <lukasstraub2@web.de>
> ---
>  MAINTAINERS                |  1 +
>  migration/meson.build      |  2 +-
>  migration/multifd-colo.c   | 50 ++++++++++++++++++++++++++++++++++++++++++++++
>  migration/multifd-colo.h   | 26 ++++++++++++++++++++++++
>  migration/multifd-nocomp.c | 10 +++++++++-
>  migration/multifd.c        |  8 ++++++++
>  migration/multifd.h        |  5 ++++-
>  7 files changed, 99 insertions(+), 3 deletions(-)
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 1e9bdd87c3a2f84f3abfc56986cd793976810fdd..883f0a8f4eb92d0bf0f89fcab4674ccc4aed1cc1 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -3853,6 +3853,7 @@ COLO Framework
>  M: Lukas Straub <lukasstraub2@web.de>
>  S: Maintained
>  F: migration/colo*
> +F: migration/multifd-colo.*
>  F: include/migration/colo.h
>  F: include/migration/failover.h
>  F: docs/COLO-FT.txt
> diff --git a/migration/meson.build b/migration/meson.build
> index c7f39bdb55239ecb0e775c77b90a1aa9e6a4a9ce..c9f0f5f9f2137536497e53e960ce70654ad1b394 100644
> --- a/migration/meson.build
> +++ b/migration/meson.build
> @@ -39,7 +39,7 @@ system_ss.add(files(
>  ), gnutls, zlib)
>  
>  if get_option('replication').allowed()
> -  system_ss.add(files('colo-failover.c', 'colo.c'))
> +  system_ss.add(files('colo-failover.c', 'colo.c', 'multifd-colo.c'))
>  else
>    system_ss.add(files('colo-stubs.c'))
>  endif
> diff --git a/migration/multifd-colo.c b/migration/multifd-colo.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..c47f5044663969e0c9af56da5ec34902d635810a
> --- /dev/null
> +++ b/migration/multifd-colo.c
> @@ -0,0 +1,50 @@
> +/*
> + * SPDX-License-Identifier: GPL-2.0-or-later
> + *
> + * multifd colo implementation
> + *
> + * Copyright (c) Lukas Straub <lukasstraub2@web.de>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "exec/target_page.h"
> +#include "qemu/error-report.h"
> +#include "qapi/error.h"
> +#include "ram.h"
> +#include "multifd.h"
> +#include "options.h"
> +#include "io/channel-socket.h"
> +#include "migration/colo.h"
> +#include "multifd-colo.h"
> +#include "system/ramblock.h"
> +
> +void multifd_colo_prepare_recv(MultiFDRecvParams *p)
> +{
> +    /*
> +     * While we're still in precopy state (not yet in colo state), we copy
> +     * received pages to both guest and cache. No need to set dirty bits,
> +     * since guest and cache memory are in sync.
> +     */
> +    if (migration_incoming_in_colo_state()) {

What's the relationship between migration_incoming_in_colo_state() and
migration_incoming_colo_enabled()? ram_load_precopy() checks both. Would
migration_incoming_colo_enabled affect multifd as well?

The multifd recv threads will be running until after
process_incoming_migration_bh(), which is when
migration_incoming_disable_colo() runs.

Also, is the colo_cache guaranteed to be around until multifd threads
exit?

> +        colo_record_bitmap(p->block, p->normal, p->normal_num);
> +        colo_record_bitmap(p->block, p->zero, p->zero_num);
> +    }
> +}
> +
> +void multifd_colo_process_recv(MultiFDRecvParams *p)
> +{
> +    if (!migration_incoming_in_colo_state()) {
> +        for (int i = 0; i < p->normal_num; i++) {
> +            void *guest = p->block->host + p->normal[i];
> +            void *cache = p->host + p->normal[i];
> +            memcpy(guest, cache, multifd_ram_page_size());
> +        }

I see some differences between what ram.c does and what multifd will do
after this patch regarding which flags are checked and order of copies
(code below):

ram.c:

  - migration_incoming_colo_enabled && migration_incoming_in_colo_state:
  Reads from stream into colo_cache.
  
  - migration_incoming_colo_enabled && !migration_incoming_in_colo_state:
  Reads from stream into guest and then memcpy into colo_cache.

  - !migration_incoming_colo_enabled
  Reads from stream into guest.

multifd.c:

  - migrate_colo:
  Reads from stream into colo_cache.
  
  - !migration_incoming_in_colo_state:
  memcpy from colo_cache into guest.

  - !migration_incoming_colo_enabled
  ???

The resulting state should be the same, but I wonder if we want to i) use
the same checks in multifd and ii) when not in colo state, copy first
into guest (using readv) and later memcpy into the colo_cache.

---
ram.c:

host = host_from_ram_block_offset(block, addr);
if (migration_incoming_colo_enabled()) {
    if (migration_incoming_in_colo_state()) {
        host = colo_cache_from_block_offset(block, addr, true);
    } else {
        host_bak = colo_cache_from_block_offset(block, addr, false);
    }
}
qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
if (host_bak) {
    memcpy(host_bak, host, TARGET_PAGE_SIZE);
}

multifd:

if (migrate_colo()) {
    p->host = p->block->colo_cache;
}

for (int i = 0; i < p->normal_num; i++) {
    p->iov[i].iov_base = p->host + p->normal[i];
}
return qio_channel_readv_all(p->c, p->iov, p->normal_num, errp);

if (!migration_incoming_in_colo_state()) {
    for (int i = 0; i < p->normal_num; i++) {
        void *guest = p->block->host + p->normal[i];
        void *cache = p->host + p->normal[i];
        memcpy(guest, cache, multifd_ram_page_size());
    }
}
---

> +        for (int i = 0; i < p->zero_num; i++) {
> +            void *guest = p->block->host + p->zero[i];
> +            memset(guest, 0, multifd_ram_page_size());
> +        }

At multifd_nocomp_recv, there will be a call to
multifd_recv_zero_page_process(), which by that point will have p->host
== p->block->colo_cache, so it looks like that function will do some
zero page processing in the colo_cache, setting the rb->receivedmap for
pages in the colo_cache and potentially also doing a memcpy. Is this
intended?

I'm thinking that maybe it would overall be better to hook colo directly
in to multifd_nocomp_recv:

static int multifd_nocomp_recv(MultiFDRecvParams *p, Error **errp)
{
    uint32_t flags;

    if (migrate_mapped_ram()) {
        return multifd_file_recv_data(p, errp);
    }

    flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK;

    if (flags != MULTIFD_FLAG_NOCOMP) {
        error_setg(errp, "multifd %u: flags received %x flags expected %x",
                   p->id, flags, MULTIFD_FLAG_NOCOMP);
        return -1;
    }

+    if (migration_incoming_colo_enabled() && migration_incoming_in_colo_state()) {
+        p->host = p->block->colo_cache;
+    } // or else{}, depending on how deal with zero pages in the cache

    multifd_recv_zero_page_process(p);

    if (!p->normal_num) {
        return 0;
    }

    for (int i = 0; i < p->normal_num; i++) {
        p->iov[i].iov_base = p->host + p->normal[i];
        p->iov[i].iov_len = multifd_ram_page_size();
        ramblock_recv_bitmap_set_offset(p->block, p->normal[i]);
    }
+    ret = qio_channel_readv_all(p->c, p->iov, p->normal_num, errp);
+    if (ret != 0) {
+        return ret;
+    }
+
+    if (migration_incoming_colo_enabled()) {
+        multifd_colo_process_recv();
+    }

    return ret;
}


> +    }
> +}
> diff --git a/migration/multifd-colo.h b/migration/multifd-colo.h
> new file mode 100644
> index 0000000000000000000000000000000000000000..82eaf3f48c47de2f090f9de52f9d57a337d4754a
> --- /dev/null
> +++ b/migration/multifd-colo.h
> @@ -0,0 +1,26 @@
> +/*
> + * SPDX-License-Identifier: GPL-2.0-or-later
> + *
> + * multifd colo header
> + *
> + * Copyright (c) Lukas Straub <lukasstraub2@web.de>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + */
> +
> +#ifndef QEMU_MIGRATION_MULTIFD_COLO_H
> +#define QEMU_MIGRATION_MULTIFD_COLO_H
> +
> +#ifdef CONFIG_REPLICATION
> +
> +void multifd_colo_prepare_recv(MultiFDRecvParams *p);
> +void multifd_colo_process_recv(MultiFDRecvParams *p);
> +
> +#else
> +
> +static inline void multifd_colo_prepare_recv(MultiFDRecvParams *p) {}
> +static inline void multifd_colo_process_recv(MultiFDRecvParams *p) {}
> +
> +#endif
> +#endif
> diff --git a/migration/multifd-nocomp.c b/migration/multifd-nocomp.c
> index 9be79b3b8e00371ebff9e112766c225bec260bf7..9f7a792fa761b3bc30b971b35f464103a61787f0 100644
> --- a/migration/multifd-nocomp.c
> +++ b/migration/multifd-nocomp.c
> @@ -16,6 +16,7 @@
>  #include "file.h"
>  #include "migration-stats.h"
>  #include "multifd.h"
> +#include "multifd-colo.h"
>  #include "options.h"
>  #include "migration.h"
>  #include "qapi/error.h"
> @@ -269,7 +270,6 @@ int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp)
>          return -1;
>      }
>  
> -    p->host = p->block->host;
>      for (i = 0; i < p->normal_num; i++) {
>          uint64_t offset = be64_to_cpu(packet->offset[i]);
>  
> @@ -294,6 +294,14 @@ int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp)
>          p->zero[i] = offset;
>      }
>  
> +    if (migrate_colo()) {
> +        multifd_colo_prepare_recv(p);
> +        assert(p->block->colo_cache);
> +        p->host = p->block->colo_cache;

Can't you just use p->block->colo_cache later? I don't see why p->host
needs to be set beforehand even in the non-colo case.

> +    } else {
> +        p->host = p->block->host;
> +    }
> +
>      return 0;
>  }
>  
> diff --git a/migration/multifd.c b/migration/multifd.c
> index 332e6fc58053462419f3171f6c320ac37648ef7b..220ed8564960fdabc58e4baa069dd252c8ad293c 100644
> --- a/migration/multifd.c
> +++ b/migration/multifd.c
> @@ -29,6 +29,7 @@
>  #include "qemu-file.h"
>  #include "trace.h"
>  #include "multifd.h"
> +#include "multifd-colo.h"
>  #include "options.h"
>  #include "qemu/yank.h"
>  #include "io/channel-file.h"
> @@ -1258,6 +1259,13 @@ static int multifd_ram_state_recv(MultiFDRecvParams *p, Error **errp)
>      int ret;
>  
>      ret = multifd_recv_state->ops->recv(p, errp);
> +    if (ret != 0) {
> +        return ret;
> +    }
> +
> +    if (migrate_colo()) {
> +        multifd_colo_process_recv(p);
> +    }

Either put all of colo hooks in multifd.c or multifd-nocomp.c. I think
the latter is more appropriate as we have mapped_ram already in
there. Let's drop patch 3 and put this in multifd_nocomp_recv().

>  
>      return ret;
>  }
> diff --git a/migration/multifd.h b/migration/multifd.h
> index 89a395aef2b09a6762c45b5361e0ab63256feff6..fbc35702b062fdc3213ce92baed35994f5967c2b 100644
> --- a/migration/multifd.h
> +++ b/migration/multifd.h
> @@ -279,7 +279,10 @@ typedef struct {
>      uint64_t packets_recved;
>      /* ramblock */
>      RAMBlock *block;
> -    /* ramblock host address */
> +    /*
> +     * Normally, it points to ramblock's host address.  When COLO
> +     * is enabled, it points to the mirror cache for the ramblock.
> +     */
>      uint8_t *host;
>      /* buffers to recv */
>      struct iovec *iov;

Re: [PATCH v3 04/10] multifd: Add COLO support

Posted by Lukas Straub 2 weeks ago

On Mon, 26 Jan 2026 11:33:13 -0300
Fabiano Rosas <farosas@suse.de> wrote:

> Lukas Straub <lukasstraub2@web.de> writes:
> 
> > Like in the normal ram_load() path, put the received pages into the
> > colo cache and mark the pages in the bitmap so that they will be
> > flushed to the guest later.
> >  
> 
> 
> 
> > Multifd with COLO is useful to reduce the VM pause time during checkpointing
> > for latency sensitive workloads. In such workloads the worst-case latency
> > is especially important.
> >
> > Also, this is already worth it for the precopy phase as it helps with
> > converging. Moreover, multifd migration is the preferred way to do migration
> > nowadays and this allows to use multifd compression with COLO.
> >
> > Benchmark:
> > Cluster nodes
> >  - Intel Xenon E5-2630 v3
> >  - 48Gb RAM
> >  - 10G Ethernet
> > Guest
> >  - Windows Server 2016
> >  - 6Gb RAM
> >  - 4 cores
> > Workload
> >  - Upload a file to the guest with SMB to simulate moderate
> >    memory dirtying
> >  - Measure the memory transfer time portion of each checkpoint
> >  - 600ms COLO checkpoint interval
> >
> > Results
> > Plain
> >  idle mean: 4.50ms 99per: 10.33ms
> >  load mean: 24.30ms 99per: 78.05ms
> > Multifd-4
> >  idle mean: 6.48ms 99per: 10.41ms
> >  load mean: 14.12ms 99per: 31.27ms
> >
> > Evaluation
> > While multifd has slightly higher latency when the guest idles, it is
> > 10ms faster under load and more importantly it's worst case latency is
> > less than 1/2 of plain under load as can be seen in the 99. Percentile.
> >
> > Signed-off-by: Juan Quintela <quintela@redhat.com>
> > Signed-off-by: Lukas Straub <lukasstraub2@web.de>
> > ---
> >  MAINTAINERS                |  1 +
> >  migration/meson.build      |  2 +-
> >  migration/multifd-colo.c   | 50 ++++++++++++++++++++++++++++++++++++++++++++++
> >  migration/multifd-colo.h   | 26 ++++++++++++++++++++++++
> >  migration/multifd-nocomp.c | 10 +++++++++-
> >  migration/multifd.c        |  8 ++++++++
> >  migration/multifd.h        |  5 ++++-
> >  7 files changed, 99 insertions(+), 3 deletions(-)
> >
> > diff --git a/MAINTAINERS b/MAINTAINERS
> > index 1e9bdd87c3a2f84f3abfc56986cd793976810fdd..883f0a8f4eb92d0bf0f89fcab4674ccc4aed1cc1 100644
> > --- a/MAINTAINERS
> > +++ b/MAINTAINERS
> > @@ -3853,6 +3853,7 @@ COLO Framework
> >  M: Lukas Straub <lukasstraub2@web.de>
> >  S: Maintained
> >  F: migration/colo*
> > +F: migration/multifd-colo.*
> >  F: include/migration/colo.h
> >  F: include/migration/failover.h
> >  F: docs/COLO-FT.txt
> > diff --git a/migration/meson.build b/migration/meson.build
> > index c7f39bdb55239ecb0e775c77b90a1aa9e6a4a9ce..c9f0f5f9f2137536497e53e960ce70654ad1b394 100644
> > --- a/migration/meson.build
> > +++ b/migration/meson.build
> > @@ -39,7 +39,7 @@ system_ss.add(files(
> >  ), gnutls, zlib)
> >  
> >  if get_option('replication').allowed()
> > -  system_ss.add(files('colo-failover.c', 'colo.c'))
> > +  system_ss.add(files('colo-failover.c', 'colo.c', 'multifd-colo.c'))
> >  else
> >    system_ss.add(files('colo-stubs.c'))
> >  endif
> > diff --git a/migration/multifd-colo.c b/migration/multifd-colo.c
> > new file mode 100644
> > index 0000000000000000000000000000000000000000..c47f5044663969e0c9af56da5ec34902d635810a
> > --- /dev/null
> > +++ b/migration/multifd-colo.c
> > @@ -0,0 +1,50 @@
> > +/*
> > + * SPDX-License-Identifier: GPL-2.0-or-later
> > + *
> > + * multifd colo implementation
> > + *
> > + * Copyright (c) Lukas Straub <lukasstraub2@web.de>
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> > + * See the COPYING file in the top-level directory.
> > + */
> > +
> > +#include "qemu/osdep.h"
> > +#include "exec/target_page.h"
> > +#include "qemu/error-report.h"
> > +#include "qapi/error.h"
> > +#include "ram.h"
> > +#include "multifd.h"
> > +#include "options.h"
> > +#include "io/channel-socket.h"
> > +#include "migration/colo.h"
> > +#include "multifd-colo.h"
> > +#include "system/ramblock.h"
> > +
> > +void multifd_colo_prepare_recv(MultiFDRecvParams *p)
> > +{
> > +    /*
> > +     * While we're still in precopy state (not yet in colo state), we copy
> > +     * received pages to both guest and cache. No need to set dirty bits,
> > +     * since guest and cache memory are in sync.
> > +     */
> > +    if (migration_incoming_in_colo_state()) {  
> 
> What's the relationship between migration_incoming_in_colo_state() and
> migration_incoming_colo_enabled()? ram_load_precopy() checks both. Would
> migration_incoming_colo_enabled affect multifd as well?

So first off migration_incoming_colo_enabled() and migrate_colo()
are equivalent in practice since
121ccedc2b migration: block incoming colo when capability is disabled

(I have some cleanup patches lying around, but that will be for later)

For colo, we do normal precopy migration and at the end we go into colo
state and then
migration_incoming_in_colo_state() will be true.

Here we check migrate_colo() outside of these functions as Peter
requested that in a previous version.

> 
> The multifd recv threads will be running until after
> process_incoming_migration_bh(), which is when
> migration_incoming_disable_colo() runs.

That is not an issue as we use migrate_colo() here.

> 
> Also, is the colo_cache guaranteed to be around until multifd threads
> exit?

This is an issue. I will fix it in the next version.

>
> > +        colo_record_bitmap(p->block, p->normal, p->normal_num);
> > +        colo_record_bitmap(p->block, p->zero, p->zero_num);
> > +    }
> > +}
> > +
> > +void multifd_colo_process_recv(MultiFDRecvParams *p)
> > +{
> > +    if (!migration_incoming_in_colo_state()) {
> > +        for (int i = 0; i < p->normal_num; i++) {
> > +            void *guest = p->block->host + p->normal[i];
> > +            void *cache = p->host + p->normal[i];
> > +            memcpy(guest, cache, multifd_ram_page_size());
> > +        }  
> 
> I see some differences between what ram.c does and what multifd will do
> after this patch regarding which flags are checked and order of copies
> (code below):
> 
> ram.c:
> 
>   - migration_incoming_colo_enabled && migration_incoming_in_colo_state:
>   Reads from stream into colo_cache.
>   
>   - migration_incoming_colo_enabled && !migration_incoming_in_colo_state:
>   Reads from stream into guest and then memcpy into colo_cache.
> 
>   - !migration_incoming_colo_enabled
>   Reads from stream into guest.
> 
> multifd.c:
> 
>   - migrate_colo:
>   Reads from stream into colo_cache.
>   
>   - !migration_incoming_in_colo_state:
>   memcpy from colo_cache into guest.
> 
>   - !migration_incoming_colo_enabled
>   ???
> 
> The resulting state should be the same, but I wonder if we want to i) use
> the same checks in multifd

migration_incoming_colo_enabled() shouldn't even exist anymore, so I'm
not using it here. migrate_colo() is much easier to reason about.

> and ii) when not in colo state, copy first
> into guest (using readv) and later memcpy into the colo_cache.

I think it is easier the way it is now.

> 
> ---
> ram.c:
> 
> host = host_from_ram_block_offset(block, addr);
> if (migration_incoming_colo_enabled()) {
>     if (migration_incoming_in_colo_state()) {
>         host = colo_cache_from_block_offset(block, addr, true);
>     } else {
>         host_bak = colo_cache_from_block_offset(block, addr, false);
>     }
> }
> qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
> if (host_bak) {
>     memcpy(host_bak, host, TARGET_PAGE_SIZE);
> }
> 
> multifd:
> 
> if (migrate_colo()) {
>     p->host = p->block->colo_cache;
> }
> 
> for (int i = 0; i < p->normal_num; i++) {
>     p->iov[i].iov_base = p->host + p->normal[i];
> }
> return qio_channel_readv_all(p->c, p->iov, p->normal_num, errp);
> 
> if (!migration_incoming_in_colo_state()) {
>     for (int i = 0; i < p->normal_num; i++) {
>         void *guest = p->block->host + p->normal[i];
>         void *cache = p->host + p->normal[i];
>         memcpy(guest, cache, multifd_ram_page_size());
>     }
> }
> ---
> 
> > +        for (int i = 0; i < p->zero_num; i++) {
> > +            void *guest = p->block->host + p->zero[i];
> > +            memset(guest, 0, multifd_ram_page_size());
> > +        }  
> 
> At multifd_nocomp_recv, there will be a call to
> multifd_recv_zero_page_process(), which by that point will have p->host
> == p->block->colo_cache, so it looks like that function will do some
> zero page processing in the colo_cache, setting the rb->receivedmap for
> pages in the colo_cache and potentially also doing a memcpy. Is this
> intended?

rb->receivedmap is only for postcopy, right? So it doesn't apply with
colo.

> 
> I'm thinking that maybe it would overall be better to hook colo directly
> in to multifd_nocomp_recv:

But then it will only work for nocomp, right? It feels like the wrong
level of abstraction to me.

> 
> static int multifd_nocomp_recv(MultiFDRecvParams *p, Error **errp)
> {
>     uint32_t flags;
> 
>     if (migrate_mapped_ram()) {
>         return multifd_file_recv_data(p, errp);
>     }
> 
>     flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK;
> 
>     if (flags != MULTIFD_FLAG_NOCOMP) {
>         error_setg(errp, "multifd %u: flags received %x flags expected %x",
>                    p->id, flags, MULTIFD_FLAG_NOCOMP);
>         return -1;
>     }
> 
> +    if (migration_incoming_colo_enabled() && migration_incoming_in_colo_state()) {
> +        p->host = p->block->colo_cache;
> +    } // or else{}, depending on how deal with zero pages in the cache
> 
>     multifd_recv_zero_page_process(p);
> 
>     if (!p->normal_num) {
>         return 0;
>     }
> 
>     for (int i = 0; i < p->normal_num; i++) {
>         p->iov[i].iov_base = p->host + p->normal[i];
>         p->iov[i].iov_len = multifd_ram_page_size();
>         ramblock_recv_bitmap_set_offset(p->block, p->normal[i]);
>     }
> +    ret = qio_channel_readv_all(p->c, p->iov, p->normal_num, errp);
> +    if (ret != 0) {
> +        return ret;
> +    }
> +
> +    if (migration_incoming_colo_enabled()) {
> +        multifd_colo_process_recv();
> +    }
> 
>     return ret;
> }
> 
> 
> > +    }
> > +}
> > diff --git a/migration/multifd-colo.h b/migration/multifd-colo.h
> > new file mode 100644
> > index 0000000000000000000000000000000000000000..82eaf3f48c47de2f090f9de52f9d57a337d4754a
> > --- /dev/null
> > +++ b/migration/multifd-colo.h
> > @@ -0,0 +1,26 @@
> > +/*
> > + * SPDX-License-Identifier: GPL-2.0-or-later
> > + *
> > + * multifd colo header
> > + *
> > + * Copyright (c) Lukas Straub <lukasstraub2@web.de>
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> > + * See the COPYING file in the top-level directory.
> > + */
> > +
> > +#ifndef QEMU_MIGRATION_MULTIFD_COLO_H
> > +#define QEMU_MIGRATION_MULTIFD_COLO_H
> > +
> > +#ifdef CONFIG_REPLICATION
> > +
> > +void multifd_colo_prepare_recv(MultiFDRecvParams *p);
> > +void multifd_colo_process_recv(MultiFDRecvParams *p);
> > +
> > +#else
> > +
> > +static inline void multifd_colo_prepare_recv(MultiFDRecvParams *p) {}
> > +static inline void multifd_colo_process_recv(MultiFDRecvParams *p) {}
> > +
> > +#endif
> > +#endif
> > diff --git a/migration/multifd-nocomp.c b/migration/multifd-nocomp.c
> > index 9be79b3b8e00371ebff9e112766c225bec260bf7..9f7a792fa761b3bc30b971b35f464103a61787f0 100644
> > --- a/migration/multifd-nocomp.c
> > +++ b/migration/multifd-nocomp.c
> > @@ -16,6 +16,7 @@
> >  #include "file.h"
> >  #include "migration-stats.h"
> >  #include "multifd.h"
> > +#include "multifd-colo.h"
> >  #include "options.h"
> >  #include "migration.h"
> >  #include "qapi/error.h"
> > @@ -269,7 +270,6 @@ int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp)
> >          return -1;
> >      }
> >  
> > -    p->host = p->block->host;
> >      for (i = 0; i < p->normal_num; i++) {
> >          uint64_t offset = be64_to_cpu(packet->offset[i]);
> >  
> > @@ -294,6 +294,14 @@ int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp)
> >          p->zero[i] = offset;
> >      }
> >  
> > +    if (migrate_colo()) {
> > +        multifd_colo_prepare_recv(p);
> > +        assert(p->block->colo_cache);
> > +        p->host = p->block->colo_cache;  
> 
> Can't you just use p->block->colo_cache later? I don't see why p->host
> needs to be set beforehand even in the non-colo case.

We should not touch the guest ram directly while in colo state, since
the incoming guest is running and we either want to receive and apply a
whole checkpoint with all ram into colo cache and all device state,
or if anything goes wrong during checkpointing, keep the currently
running guest on the incoming side in pristine state.

I have written more about colo migration here:

https://lore.kernel.org/qemu-devel/20260117204913.584e1829@penguin/
https://lore.kernel.org/qemu-devel/aXE1i9xJ81EWokYz@x1.local/

> 
> > +    } else {
> > +        p->host = p->block->host;
> > +    }
> > +
> >      return 0;
> >  }
> >  
> > diff --git a/migration/multifd.c b/migration/multifd.c
> > index 332e6fc58053462419f3171f6c320ac37648ef7b..220ed8564960fdabc58e4baa069dd252c8ad293c 100644
> > --- a/migration/multifd.c
> > +++ b/migration/multifd.c
> > @@ -29,6 +29,7 @@
> >  #include "qemu-file.h"
> >  #include "trace.h"
> >  #include "multifd.h"
> > +#include "multifd-colo.h"
> >  #include "options.h"
> >  #include "qemu/yank.h"
> >  #include "io/channel-file.h"
> > @@ -1258,6 +1259,13 @@ static int multifd_ram_state_recv(MultiFDRecvParams *p, Error **errp)
> >      int ret;
> >  
> >      ret = multifd_recv_state->ops->recv(p, errp);
> > +    if (ret != 0) {
> > +        return ret;
> > +    }
> > +
> > +    if (migrate_colo()) {
> > +        multifd_colo_process_recv(p);
> > +    }  
> 
> Either put all of colo hooks in multifd.c or multifd-nocomp.c. I think
> the latter is more appropriate as we have mapped_ram already in
> there. Let's drop patch 3 and put this in multifd_nocomp_recv().

Again, it also should work with compression and multifd_nocomp_recv()
is for nocomp only.

> 
> >  
> >      return ret;
> >  }
> > diff --git a/migration/multifd.h b/migration/multifd.h
> > index 89a395aef2b09a6762c45b5361e0ab63256feff6..fbc35702b062fdc3213ce92baed35994f5967c2b 100644
> > --- a/migration/multifd.h
> > +++ b/migration/multifd.h
> > @@ -279,7 +279,10 @@ typedef struct {
> >      uint64_t packets_recved;
> >      /* ramblock */
> >      RAMBlock *block;
> > -    /* ramblock host address */
> > +    /*
> > +     * Normally, it points to ramblock's host address.  When COLO
> > +     * is enabled, it points to the mirror cache for the ramblock.
> > +     */
> >      uint8_t *host;
> >      /* buffers to recv */
> >      struct iovec *iov;

Re: [PATCH v3 04/10] multifd: Add COLO support

Posted by Fabiano Rosas 2 weeks ago

Lukas Straub <lukasstraub2@web.de> writes:

>> > +void multifd_colo_prepare_recv(MultiFDRecvParams *p)
>> > +{
>> > +    /*
>> > +     * While we're still in precopy state (not yet in colo state), we copy
>> > +     * received pages to both guest and cache. No need to set dirty bits,
>> > +     * since guest and cache memory are in sync.
>> > +     */
>> > +    if (migration_incoming_in_colo_state()) {  
>> 
>> What's the relationship between migration_incoming_in_colo_state() and
>> migration_incoming_colo_enabled()? ram_load_precopy() checks both. Would
>> migration_incoming_colo_enabled affect multifd as well?
>
> So first off migration_incoming_colo_enabled() and migrate_colo()
> are equivalent in practice since
> 121ccedc2b migration: block incoming colo when capability is disabled
>
> (I have some cleanup patches lying around, but that will be for later)
>

Ok, I think those are important because when having multifd and
non-multifd code for the same feature, it's useful to be able to compare
the two. So some degree of uniformity would be nice.

> For colo, we do normal precopy migration and at the end we go into colo
> state and then
> migration_incoming_in_colo_state() will be true.
>
> Here we check migrate_colo() outside of these functions as Peter
> requested that in a previous version.
>
>> 
>> The multifd recv threads will be running until after
>> process_incoming_migration_bh(), which is when
>> migration_incoming_disable_colo() runs.
>
> That is not an issue as we use migrate_colo() here.
>
>> 
>> Also, is the colo_cache guaranteed to be around until multifd threads
>> exit?
>
> This is an issue. I will fix it in the next version.
>
>>
>> > +        colo_record_bitmap(p->block, p->normal, p->normal_num);
>> > +        colo_record_bitmap(p->block, p->zero, p->zero_num);
>> > +    }
>> > +}
>> > +
>> > +void multifd_colo_process_recv(MultiFDRecvParams *p)
>> > +{
>> > +    if (!migration_incoming_in_colo_state()) {
>> > +        for (int i = 0; i < p->normal_num; i++) {
>> > +            void *guest = p->block->host + p->normal[i];
>> > +            void *cache = p->host + p->normal[i];
>> > +            memcpy(guest, cache, multifd_ram_page_size());
>> > +        }  
>> 
>> I see some differences between what ram.c does and what multifd will do
>> after this patch regarding which flags are checked and order of copies
>> (code below):
>> 
>> ram.c:
>> 
>>   - migration_incoming_colo_enabled && migration_incoming_in_colo_state:
>>   Reads from stream into colo_cache.
>>   
>>   - migration_incoming_colo_enabled && !migration_incoming_in_colo_state:
>>   Reads from stream into guest and then memcpy into colo_cache.
>> 
>>   - !migration_incoming_colo_enabled
>>   Reads from stream into guest.
>> 
>> multifd.c:
>> 
>>   - migrate_colo:
>>   Reads from stream into colo_cache.
>>   
>>   - !migration_incoming_in_colo_state:
>>   memcpy from colo_cache into guest.
>> 
>>   - !migration_incoming_colo_enabled
>>   ???
>> 
>> The resulting state should be the same, but I wonder if we want to i) use
>> the same checks in multifd
>
> migration_incoming_colo_enabled() shouldn't even exist anymore, so I'm
> not using it here. migrate_colo() is much easier to reason about.
>
>> and ii) when not in colo state, copy first
>> into guest (using readv) and later memcpy into the colo_cache.
>
> I think it is easier the way it is now.
>
>> 
>> ---
>> ram.c:
>> 
>> host = host_from_ram_block_offset(block, addr);
>> if (migration_incoming_colo_enabled()) {
>>     if (migration_incoming_in_colo_state()) {
>>         host = colo_cache_from_block_offset(block, addr, true);
>>     } else {
>>         host_bak = colo_cache_from_block_offset(block, addr, false);
>>     }
>> }
>> qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
>> if (host_bak) {
>>     memcpy(host_bak, host, TARGET_PAGE_SIZE);
>> }
>> 
>> multifd:
>> 
>> if (migrate_colo()) {
>>     p->host = p->block->colo_cache;
>> }
>> 
>> for (int i = 0; i < p->normal_num; i++) {
>>     p->iov[i].iov_base = p->host + p->normal[i];
>> }
>> return qio_channel_readv_all(p->c, p->iov, p->normal_num, errp);
>> 
>> if (!migration_incoming_in_colo_state()) {
>>     for (int i = 0; i < p->normal_num; i++) {
>>         void *guest = p->block->host + p->normal[i];
>>         void *cache = p->host + p->normal[i];
>>         memcpy(guest, cache, multifd_ram_page_size());
>>     }
>> }
>> ---
>> 
>> > +        for (int i = 0; i < p->zero_num; i++) {
>> > +            void *guest = p->block->host + p->zero[i];
>> > +            memset(guest, 0, multifd_ram_page_size());
>> > +        }  
>> 
>> At multifd_nocomp_recv, there will be a call to
>> multifd_recv_zero_page_process(), which by that point will have p->host
>> == p->block->colo_cache, so it looks like that function will do some
>> zero page processing in the colo_cache, setting the rb->receivedmap for
>> pages in the colo_cache and potentially also doing a memcpy. Is this
>> intended?
>
> rb->receivedmap is only for postcopy, right? So it doesn't apply with
> colo.
>

It's not anymore since commit 5ef7e26bdb ("migration/multifd: solve zero
page causing multiple page faults"). So it seems we might be doing extra
work on top of the colo_cache.

>> 
>> I'm thinking that maybe it would overall be better to hook colo directly
>> in to multifd_nocomp_recv:
>
> But then it will only work for nocomp, right? It feels like the wrong
> level of abstraction to me.
>

Ah, nocomp != ram indeed.

>> 
>> static int multifd_nocomp_recv(MultiFDRecvParams *p, Error **errp)
>> {
>>     uint32_t flags;
>> 
>>     if (migrate_mapped_ram()) {
>>         return multifd_file_recv_data(p, errp);
>>     }
>> 
>>     flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK;
>> 
>>     if (flags != MULTIFD_FLAG_NOCOMP) {
>>         error_setg(errp, "multifd %u: flags received %x flags expected %x",
>>                    p->id, flags, MULTIFD_FLAG_NOCOMP);
>>         return -1;
>>     }
>> 
>> +    if (migration_incoming_colo_enabled() && migration_incoming_in_colo_state()) {
>> +        p->host = p->block->colo_cache;
>> +    } // or else{}, depending on how deal with zero pages in the cache
>> 
>>     multifd_recv_zero_page_process(p);
>> 
>>     if (!p->normal_num) {
>>         return 0;
>>     }
>> 
>>     for (int i = 0; i < p->normal_num; i++) {
>>         p->iov[i].iov_base = p->host + p->normal[i];
>>         p->iov[i].iov_len = multifd_ram_page_size();
>>         ramblock_recv_bitmap_set_offset(p->block, p->normal[i]);
>>     }
>> +    ret = qio_channel_readv_all(p->c, p->iov, p->normal_num, errp);
>> +    if (ret != 0) {
>> +        return ret;
>> +    }
>> +
>> +    if (migration_incoming_colo_enabled()) {
>> +        multifd_colo_process_recv();
>> +    }
>> 
>>     return ret;
>> }
>> 
>> 
>> > +    }
>> > +}
>> > diff --git a/migration/multifd-colo.h b/migration/multifd-colo.h
>> > new file mode 100644
>> > index 0000000000000000000000000000000000000000..82eaf3f48c47de2f090f9de52f9d57a337d4754a
>> > --- /dev/null
>> > +++ b/migration/multifd-colo.h
>> > @@ -0,0 +1,26 @@
>> > +/*
>> > + * SPDX-License-Identifier: GPL-2.0-or-later
>> > + *
>> > + * multifd colo header
>> > + *
>> > + * Copyright (c) Lukas Straub <lukasstraub2@web.de>
>> > + *
>> > + * This work is licensed under the terms of the GNU GPL, version 2 or later.
>> > + * See the COPYING file in the top-level directory.
>> > + */
>> > +
>> > +#ifndef QEMU_MIGRATION_MULTIFD_COLO_H
>> > +#define QEMU_MIGRATION_MULTIFD_COLO_H
>> > +
>> > +#ifdef CONFIG_REPLICATION
>> > +
>> > +void multifd_colo_prepare_recv(MultiFDRecvParams *p);
>> > +void multifd_colo_process_recv(MultiFDRecvParams *p);
>> > +
>> > +#else
>> > +
>> > +static inline void multifd_colo_prepare_recv(MultiFDRecvParams *p) {}
>> > +static inline void multifd_colo_process_recv(MultiFDRecvParams *p) {}
>> > +
>> > +#endif
>> > +#endif
>> > diff --git a/migration/multifd-nocomp.c b/migration/multifd-nocomp.c
>> > index 9be79b3b8e00371ebff9e112766c225bec260bf7..9f7a792fa761b3bc30b971b35f464103a61787f0 100644
>> > --- a/migration/multifd-nocomp.c
>> > +++ b/migration/multifd-nocomp.c
>> > @@ -16,6 +16,7 @@
>> >  #include "file.h"
>> >  #include "migration-stats.h"
>> >  #include "multifd.h"
>> > +#include "multifd-colo.h"
>> >  #include "options.h"
>> >  #include "migration.h"
>> >  #include "qapi/error.h"
>> > @@ -269,7 +270,6 @@ int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp)
>> >          return -1;
>> >      }
>> >  
>> > -    p->host = p->block->host;
>> >      for (i = 0; i < p->normal_num; i++) {
>> >          uint64_t offset = be64_to_cpu(packet->offset[i]);
>> >  
>> > @@ -294,6 +294,14 @@ int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp)
>> >          p->zero[i] = offset;
>> >      }
>> >  
>> > +    if (migrate_colo()) {
>> > +        multifd_colo_prepare_recv(p);
>> > +        assert(p->block->colo_cache);
>> > +        p->host = p->block->colo_cache;  
>> 
>> Can't you just use p->block->colo_cache later? I don't see why p->host
>> needs to be set beforehand even in the non-colo case.
>
> We should not touch the guest ram directly while in colo state, since
> the incoming guest is running and we either want to receive and apply a
> whole checkpoint with all ram into colo cache and all device state,
> or if anything goes wrong during checkpointing, keep the currently
> running guest on the incoming side in pristine state.
>

I was asking about setting p->host at this specific point. I don't think
any of this fits the unfill function. However, I see those were
suggested by Peter so let's not go back and forth.

> I have written more about colo migration here:
>
> https://lore.kernel.org/qemu-devel/20260117204913.584e1829@penguin/
> https://lore.kernel.org/qemu-devel/aXE1i9xJ81EWokYz@x1.local/
>
>> 
>> > +    } else {
>> > +        p->host = p->block->host;
>> > +    }
>> > +
>> >      return 0;
>> >  }
>> >  
>> > diff --git a/migration/multifd.c b/migration/multifd.c
>> > index 332e6fc58053462419f3171f6c320ac37648ef7b..220ed8564960fdabc58e4baa069dd252c8ad293c 100644
>> > --- a/migration/multifd.c
>> > +++ b/migration/multifd.c
>> > @@ -29,6 +29,7 @@
>> >  #include "qemu-file.h"
>> >  #include "trace.h"
>> >  #include "multifd.h"
>> > +#include "multifd-colo.h"
>> >  #include "options.h"
>> >  #include "qemu/yank.h"
>> >  #include "io/channel-file.h"
>> > @@ -1258,6 +1259,13 @@ static int multifd_ram_state_recv(MultiFDRecvParams *p, Error **errp)
>> >      int ret;
>> >  
>> >      ret = multifd_recv_state->ops->recv(p, errp);
>> > +    if (ret != 0) {
>> > +        return ret;
>> > +    }
>> > +
>> > +    if (migrate_colo()) {
>> > +        multifd_colo_process_recv(p);
>> > +    }  
>> 
>> Either put all of colo hooks in multifd.c or multifd-nocomp.c. I think
>> the latter is more appropriate as we have mapped_ram already in
>> there. Let's drop patch 3 and put this in multifd_nocomp_recv().
>
> Again, it also should work with compression and multifd_nocomp_recv()
> is for nocomp only.
>
>> 
>> >  
>> >      return ret;
>> >  }
>> > diff --git a/migration/multifd.h b/migration/multifd.h
>> > index 89a395aef2b09a6762c45b5361e0ab63256feff6..fbc35702b062fdc3213ce92baed35994f5967c2b 100644
>> > --- a/migration/multifd.h
>> > +++ b/migration/multifd.h
>> > @@ -279,7 +279,10 @@ typedef struct {
>> >      uint64_t packets_recved;
>> >      /* ramblock */
>> >      RAMBlock *block;
>> > -    /* ramblock host address */
>> > +    /*
>> > +     * Normally, it points to ramblock's host address.  When COLO
>> > +     * is enabled, it points to the mirror cache for the ramblock.
>> > +     */
>> >      uint8_t *host;
>> >      /* buffers to recv */
>> >      struct iovec *iov;

Re: [PATCH v3 04/10] multifd: Add COLO support

Posted by Peter Xu 1 week, 6 days ago

On Mon, Jan 26, 2026 at 06:37:31PM -0300, Fabiano Rosas wrote:
> Lukas Straub <lukasstraub2@web.de> writes:
> 
> >> > +void multifd_colo_prepare_recv(MultiFDRecvParams *p)
> >> > +{
> >> > +    /*
> >> > +     * While we're still in precopy state (not yet in colo state), we copy
> >> > +     * received pages to both guest and cache. No need to set dirty bits,
> >> > +     * since guest and cache memory are in sync.
> >> > +     */
> >> > +    if (migration_incoming_in_colo_state()) {  
> >> 
> >> What's the relationship between migration_incoming_in_colo_state() and
> >> migration_incoming_colo_enabled()? ram_load_precopy() checks both. Would
> >> migration_incoming_colo_enabled affect multifd as well?
> >
> > So first off migration_incoming_colo_enabled() and migrate_colo()
> > are equivalent in practice since
> > 121ccedc2b migration: block incoming colo when capability is disabled
> >
> > (I have some cleanup patches lying around, but that will be for later)
> >
> 
> Ok, I think those are important because when having multifd and
> non-multifd code for the same feature, it's useful to be able to compare
> the two. So some degree of uniformity would be nice.

I second.  We can drop those in this series before adding multifd support,
likely together with MIG_CMD_ENABLE_COLO as well; I don't think COLO needs
to worry about old binaries.  It should always use the same QEMU binary on
both sides.

The patch needs to rename MIG_CMD_ENABLE_COLO to MIG_CMD_DEPRECATED_0 or
something, to make the rest MIG_CMD compatible to old binaries, though.

> 
> > For colo, we do normal precopy migration and at the end we go into colo
> > state and then
> > migration_incoming_in_colo_state() will be true.
> >
> > Here we check migrate_colo() outside of these functions as Peter
> > requested that in a previous version.
> >
> >> 
> >> The multifd recv threads will be running until after
> >> process_incoming_migration_bh(), which is when
> >> migration_incoming_disable_colo() runs.
> >
> > That is not an issue as we use migrate_colo() here.
> >
> >> 
> >> Also, is the colo_cache guaranteed to be around until multifd threads
> >> exit?
> >
> > This is an issue. I will fix it in the next version.
> >
> >>
> >> > +        colo_record_bitmap(p->block, p->normal, p->normal_num);
> >> > +        colo_record_bitmap(p->block, p->zero, p->zero_num);
> >> > +    }
> >> > +}
> >> > +
> >> > +void multifd_colo_process_recv(MultiFDRecvParams *p)
> >> > +{
> >> > +    if (!migration_incoming_in_colo_state()) {
> >> > +        for (int i = 0; i < p->normal_num; i++) {
> >> > +            void *guest = p->block->host + p->normal[i];
> >> > +            void *cache = p->host + p->normal[i];
> >> > +            memcpy(guest, cache, multifd_ram_page_size());
> >> > +        }  
> >> 
> >> I see some differences between what ram.c does and what multifd will do
> >> after this patch regarding which flags are checked and order of copies
> >> (code below):
> >> 
> >> ram.c:
> >> 
> >>   - migration_incoming_colo_enabled && migration_incoming_in_colo_state:
> >>   Reads from stream into colo_cache.
> >>   
> >>   - migration_incoming_colo_enabled && !migration_incoming_in_colo_state:
> >>   Reads from stream into guest and then memcpy into colo_cache.
> >> 
> >>   - !migration_incoming_colo_enabled
> >>   Reads from stream into guest.
> >> 
> >> multifd.c:
> >> 
> >>   - migrate_colo:
> >>   Reads from stream into colo_cache.
> >>   
> >>   - !migration_incoming_in_colo_state:
> >>   memcpy from colo_cache into guest.
> >> 
> >>   - !migration_incoming_colo_enabled
> >>   ???
> >> 
> >> The resulting state should be the same, but I wonder if we want to i) use
> >> the same checks in multifd
> >
> > migration_incoming_colo_enabled() shouldn't even exist anymore, so I'm
> > not using it here. migrate_colo() is much easier to reason about.
> >
> >> and ii) when not in colo state, copy first
> >> into guest (using readv) and later memcpy into the colo_cache.
> >
> > I think it is easier the way it is now.
> >
> >> 
> >> ---
> >> ram.c:
> >> 
> >> host = host_from_ram_block_offset(block, addr);
> >> if (migration_incoming_colo_enabled()) {
> >>     if (migration_incoming_in_colo_state()) {
> >>         host = colo_cache_from_block_offset(block, addr, true);
> >>     } else {
> >>         host_bak = colo_cache_from_block_offset(block, addr, false);
> >>     }
> >> }
> >> qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
> >> if (host_bak) {
> >>     memcpy(host_bak, host, TARGET_PAGE_SIZE);
> >> }
> >> 
> >> multifd:
> >> 
> >> if (migrate_colo()) {
> >>     p->host = p->block->colo_cache;
> >> }
> >> 
> >> for (int i = 0; i < p->normal_num; i++) {
> >>     p->iov[i].iov_base = p->host + p->normal[i];
> >> }
> >> return qio_channel_readv_all(p->c, p->iov, p->normal_num, errp);
> >> 
> >> if (!migration_incoming_in_colo_state()) {
> >>     for (int i = 0; i < p->normal_num; i++) {
> >>         void *guest = p->block->host + p->normal[i];
> >>         void *cache = p->host + p->normal[i];
> >>         memcpy(guest, cache, multifd_ram_page_size());
> >>     }
> >> }
> >> ---
> >> 
> >> > +        for (int i = 0; i < p->zero_num; i++) {
> >> > +            void *guest = p->block->host + p->zero[i];
> >> > +            memset(guest, 0, multifd_ram_page_size());
> >> > +        }  
> >> 
> >> At multifd_nocomp_recv, there will be a call to
> >> multifd_recv_zero_page_process(), which by that point will have p->host
> >> == p->block->colo_cache, so it looks like that function will do some
> >> zero page processing in the colo_cache, setting the rb->receivedmap for
> >> pages in the colo_cache and potentially also doing a memcpy. Is this
> >> intended?
> >
> > rb->receivedmap is only for postcopy, right? So it doesn't apply with
> > colo.
> >
> 
> It's not anymore since commit 5ef7e26bdb ("migration/multifd: solve zero
> page causing multiple page faults"). So it seems we might be doing extra
> work on top of the colo_cache.

IIUC not extra, but exactly what will be needed.

The logic was about "in a vanilla precopy, if we see one page arriving the
1st time we don't need to zero the buffer because the buffer should be zero
allocated".

In COLO's case, COLO always puts RAM data into colo_cache, hence it should
apply to colo_cache too, avoiding unnecessary memset() for colo_cache
instead.

E.g. colo_cache is allocated from qemu_anon_ram_alloc(), it's also
guaranteed to be zeros when never touched.

> 
> >> 
> >> I'm thinking that maybe it would overall be better to hook colo directly
> >> in to multifd_nocomp_recv:
> >
> > But then it will only work for nocomp, right? It feels like the wrong
> > level of abstraction to me.
> >
> 
> Ah, nocomp != ram indeed.
> 
> >> 
> >> static int multifd_nocomp_recv(MultiFDRecvParams *p, Error **errp)
> >> {
> >>     uint32_t flags;
> >> 
> >>     if (migrate_mapped_ram()) {
> >>         return multifd_file_recv_data(p, errp);
> >>     }
> >> 
> >>     flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK;
> >> 
> >>     if (flags != MULTIFD_FLAG_NOCOMP) {
> >>         error_setg(errp, "multifd %u: flags received %x flags expected %x",
> >>                    p->id, flags, MULTIFD_FLAG_NOCOMP);
> >>         return -1;
> >>     }
> >> 
> >> +    if (migration_incoming_colo_enabled() && migration_incoming_in_colo_state()) {
> >> +        p->host = p->block->colo_cache;
> >> +    } // or else{}, depending on how deal with zero pages in the cache
> >> 
> >>     multifd_recv_zero_page_process(p);
> >> 
> >>     if (!p->normal_num) {
> >>         return 0;
> >>     }
> >> 
> >>     for (int i = 0; i < p->normal_num; i++) {
> >>         p->iov[i].iov_base = p->host + p->normal[i];
> >>         p->iov[i].iov_len = multifd_ram_page_size();
> >>         ramblock_recv_bitmap_set_offset(p->block, p->normal[i]);
> >>     }
> >> +    ret = qio_channel_readv_all(p->c, p->iov, p->normal_num, errp);
> >> +    if (ret != 0) {
> >> +        return ret;
> >> +    }
> >> +
> >> +    if (migration_incoming_colo_enabled()) {
> >> +        multifd_colo_process_recv();
> >> +    }
> >> 
> >>     return ret;
> >> }
> >> 
> >> 
> >> > +    }
> >> > +}
> >> > diff --git a/migration/multifd-colo.h b/migration/multifd-colo.h
> >> > new file mode 100644
> >> > index 0000000000000000000000000000000000000000..82eaf3f48c47de2f090f9de52f9d57a337d4754a
> >> > --- /dev/null
> >> > +++ b/migration/multifd-colo.h
> >> > @@ -0,0 +1,26 @@
> >> > +/*
> >> > + * SPDX-License-Identifier: GPL-2.0-or-later
> >> > + *
> >> > + * multifd colo header
> >> > + *
> >> > + * Copyright (c) Lukas Straub <lukasstraub2@web.de>
> >> > + *
> >> > + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> >> > + * See the COPYING file in the top-level directory.
> >> > + */
> >> > +
> >> > +#ifndef QEMU_MIGRATION_MULTIFD_COLO_H
> >> > +#define QEMU_MIGRATION_MULTIFD_COLO_H
> >> > +
> >> > +#ifdef CONFIG_REPLICATION
> >> > +
> >> > +void multifd_colo_prepare_recv(MultiFDRecvParams *p);
> >> > +void multifd_colo_process_recv(MultiFDRecvParams *p);
> >> > +
> >> > +#else
> >> > +
> >> > +static inline void multifd_colo_prepare_recv(MultiFDRecvParams *p) {}
> >> > +static inline void multifd_colo_process_recv(MultiFDRecvParams *p) {}
> >> > +
> >> > +#endif
> >> > +#endif
> >> > diff --git a/migration/multifd-nocomp.c b/migration/multifd-nocomp.c
> >> > index 9be79b3b8e00371ebff9e112766c225bec260bf7..9f7a792fa761b3bc30b971b35f464103a61787f0 100644
> >> > --- a/migration/multifd-nocomp.c
> >> > +++ b/migration/multifd-nocomp.c
> >> > @@ -16,6 +16,7 @@
> >> >  #include "file.h"
> >> >  #include "migration-stats.h"
> >> >  #include "multifd.h"
> >> > +#include "multifd-colo.h"
> >> >  #include "options.h"
> >> >  #include "migration.h"
> >> >  #include "qapi/error.h"
> >> > @@ -269,7 +270,6 @@ int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp)
> >> >          return -1;
> >> >      }
> >> >  
> >> > -    p->host = p->block->host;
> >> >      for (i = 0; i < p->normal_num; i++) {
> >> >          uint64_t offset = be64_to_cpu(packet->offset[i]);
> >> >  
> >> > @@ -294,6 +294,14 @@ int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp)
> >> >          p->zero[i] = offset;
> >> >      }
> >> >  
> >> > +    if (migrate_colo()) {
> >> > +        multifd_colo_prepare_recv(p);
> >> > +        assert(p->block->colo_cache);
> >> > +        p->host = p->block->colo_cache;  
> >> 
> >> Can't you just use p->block->colo_cache later? I don't see why p->host
> >> needs to be set beforehand even in the non-colo case.
> >
> > We should not touch the guest ram directly while in colo state, since
> > the incoming guest is running and we either want to receive and apply a
> > whole checkpoint with all ram into colo cache and all device state,
> > or if anything goes wrong during checkpointing, keep the currently
> > running guest on the incoming side in pristine state.
> >
> 
> I was asking about setting p->host at this specific point. I don't think
> any of this fits the unfill function. However, I see those were
> suggested by Peter so let's not go back and forth.

Actually I don't know why p->host existed before this work; IIUC we could
have always used p->block->host.  Maybe when Juan was developing this Juan
kept COLO in mind; or maybe Juan wanted to avoid frequent p->block pointer
reference.

IIUC, we could remove p->host, but when we need to access "the buffer of
the ramblock" we'll need to call a helper to fetch that (either ramblock's
buffer, or colo_cache, per migrate_colo()).  And it might be slightly
slower than p->host indeed.

> 
> > I have written more about colo migration here:
> >
> > https://lore.kernel.org/qemu-devel/20260117204913.584e1829@penguin/
> > https://lore.kernel.org/qemu-devel/aXE1i9xJ81EWokYz@x1.local/
> >
> >> 
> >> > +    } else {
> >> > +        p->host = p->block->host;
> >> > +    }
> >> > +
> >> >      return 0;
> >> >  }
> >> >  
> >> > diff --git a/migration/multifd.c b/migration/multifd.c
> >> > index 332e6fc58053462419f3171f6c320ac37648ef7b..220ed8564960fdabc58e4baa069dd252c8ad293c 100644
> >> > --- a/migration/multifd.c
> >> > +++ b/migration/multifd.c
> >> > @@ -29,6 +29,7 @@
> >> >  #include "qemu-file.h"
> >> >  #include "trace.h"
> >> >  #include "multifd.h"
> >> > +#include "multifd-colo.h"
> >> >  #include "options.h"
> >> >  #include "qemu/yank.h"
> >> >  #include "io/channel-file.h"
> >> > @@ -1258,6 +1259,13 @@ static int multifd_ram_state_recv(MultiFDRecvParams *p, Error **errp)
> >> >      int ret;
> >> >  
> >> >      ret = multifd_recv_state->ops->recv(p, errp);
> >> > +    if (ret != 0) {
> >> > +        return ret;
> >> > +    }
> >> > +
> >> > +    if (migrate_colo()) {
> >> > +        multifd_colo_process_recv(p);
> >> > +    }  
> >> 
> >> Either put all of colo hooks in multifd.c or multifd-nocomp.c. I think
> >> the latter is more appropriate as we have mapped_ram already in
> >> there. Let's drop patch 3 and put this in multifd_nocomp_recv().
> >
> > Again, it also should work with compression and multifd_nocomp_recv()
> > is for nocomp only.
> >
> >> 
> >> >  
> >> >      return ret;
> >> >  }
> >> > diff --git a/migration/multifd.h b/migration/multifd.h
> >> > index 89a395aef2b09a6762c45b5361e0ab63256feff6..fbc35702b062fdc3213ce92baed35994f5967c2b 100644
> >> > --- a/migration/multifd.h
> >> > +++ b/migration/multifd.h
> >> > @@ -279,7 +279,10 @@ typedef struct {
> >> >      uint64_t packets_recved;
> >> >      /* ramblock */
> >> >      RAMBlock *block;
> >> > -    /* ramblock host address */
> >> > +    /*
> >> > +     * Normally, it points to ramblock's host address.  When COLO
> >> > +     * is enabled, it points to the mirror cache for the ramblock.
> >> > +     */
> >> >      uint8_t *host;
> >> >      /* buffers to recv */
> >> >      struct iovec *iov;  
> 

-- 
Peter Xu

Re: [PATCH v3 04/10] multifd: Add COLO support

Posted by Fabiano Rosas 1 week, 5 days ago

Peter Xu <peterx@redhat.com> writes:

> On Mon, Jan 26, 2026 at 06:37:31PM -0300, Fabiano Rosas wrote:
>> Lukas Straub <lukasstraub2@web.de> writes:
>> 
>> >> > +void multifd_colo_prepare_recv(MultiFDRecvParams *p)
>> >> > +{
>> >> > +    /*
>> >> > +     * While we're still in precopy state (not yet in colo state), we copy
>> >> > +     * received pages to both guest and cache. No need to set dirty bits,
>> >> > +     * since guest and cache memory are in sync.
>> >> > +     */
>> >> > +    if (migration_incoming_in_colo_state()) {  
>> >> 
>> >> What's the relationship between migration_incoming_in_colo_state() and
>> >> migration_incoming_colo_enabled()? ram_load_precopy() checks both. Would
>> >> migration_incoming_colo_enabled affect multifd as well?
>> >
>> > So first off migration_incoming_colo_enabled() and migrate_colo()
>> > are equivalent in practice since
>> > 121ccedc2b migration: block incoming colo when capability is disabled
>> >
>> > (I have some cleanup patches lying around, but that will be for later)
>> >
>> 
>> Ok, I think those are important because when having multifd and
>> non-multifd code for the same feature, it's useful to be able to compare
>> the two. So some degree of uniformity would be nice.
>
> I second.  We can drop those in this series before adding multifd support,
> likely together with MIG_CMD_ENABLE_COLO as well; I don't think COLO needs
> to worry about old binaries.  It should always use the same QEMU binary on
> both sides.
>
> The patch needs to rename MIG_CMD_ENABLE_COLO to MIG_CMD_DEPRECATED_0 or
> something, to make the rest MIG_CMD compatible to old binaries, though.
>
>> 
>> > For colo, we do normal precopy migration and at the end we go into colo
>> > state and then
>> > migration_incoming_in_colo_state() will be true.
>> >
>> > Here we check migrate_colo() outside of these functions as Peter
>> > requested that in a previous version.
>> >
>> >> 
>> >> The multifd recv threads will be running until after
>> >> process_incoming_migration_bh(), which is when
>> >> migration_incoming_disable_colo() runs.
>> >
>> > That is not an issue as we use migrate_colo() here.
>> >
>> >> 
>> >> Also, is the colo_cache guaranteed to be around until multifd threads
>> >> exit?
>> >
>> > This is an issue. I will fix it in the next version.
>> >
>> >>
>> >> > +        colo_record_bitmap(p->block, p->normal, p->normal_num);
>> >> > +        colo_record_bitmap(p->block, p->zero, p->zero_num);
>> >> > +    }
>> >> > +}
>> >> > +
>> >> > +void multifd_colo_process_recv(MultiFDRecvParams *p)
>> >> > +{
>> >> > +    if (!migration_incoming_in_colo_state()) {
>> >> > +        for (int i = 0; i < p->normal_num; i++) {
>> >> > +            void *guest = p->block->host + p->normal[i];
>> >> > +            void *cache = p->host + p->normal[i];
>> >> > +            memcpy(guest, cache, multifd_ram_page_size());
>> >> > +        }  
>> >> 
>> >> I see some differences between what ram.c does and what multifd will do
>> >> after this patch regarding which flags are checked and order of copies
>> >> (code below):
>> >> 
>> >> ram.c:
>> >> 
>> >>   - migration_incoming_colo_enabled && migration_incoming_in_colo_state:
>> >>   Reads from stream into colo_cache.
>> >>   
>> >>   - migration_incoming_colo_enabled && !migration_incoming_in_colo_state:
>> >>   Reads from stream into guest and then memcpy into colo_cache.
>> >> 
>> >>   - !migration_incoming_colo_enabled
>> >>   Reads from stream into guest.
>> >> 
>> >> multifd.c:
>> >> 
>> >>   - migrate_colo:
>> >>   Reads from stream into colo_cache.
>> >>   
>> >>   - !migration_incoming_in_colo_state:
>> >>   memcpy from colo_cache into guest.
>> >> 
>> >>   - !migration_incoming_colo_enabled
>> >>   ???
>> >> 
>> >> The resulting state should be the same, but I wonder if we want to i) use
>> >> the same checks in multifd
>> >
>> > migration_incoming_colo_enabled() shouldn't even exist anymore, so I'm
>> > not using it here. migrate_colo() is much easier to reason about.
>> >
>> >> and ii) when not in colo state, copy first
>> >> into guest (using readv) and later memcpy into the colo_cache.
>> >
>> > I think it is easier the way it is now.
>> >
>> >> 
>> >> ---
>> >> ram.c:
>> >> 
>> >> host = host_from_ram_block_offset(block, addr);
>> >> if (migration_incoming_colo_enabled()) {
>> >>     if (migration_incoming_in_colo_state()) {
>> >>         host = colo_cache_from_block_offset(block, addr, true);
>> >>     } else {
>> >>         host_bak = colo_cache_from_block_offset(block, addr, false);
>> >>     }
>> >> }
>> >> qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
>> >> if (host_bak) {
>> >>     memcpy(host_bak, host, TARGET_PAGE_SIZE);
>> >> }
>> >> 
>> >> multifd:
>> >> 
>> >> if (migrate_colo()) {
>> >>     p->host = p->block->colo_cache;
>> >> }
>> >> 
>> >> for (int i = 0; i < p->normal_num; i++) {
>> >>     p->iov[i].iov_base = p->host + p->normal[i];
>> >> }
>> >> return qio_channel_readv_all(p->c, p->iov, p->normal_num, errp);
>> >> 
>> >> if (!migration_incoming_in_colo_state()) {
>> >>     for (int i = 0; i < p->normal_num; i++) {
>> >>         void *guest = p->block->host + p->normal[i];
>> >>         void *cache = p->host + p->normal[i];
>> >>         memcpy(guest, cache, multifd_ram_page_size());
>> >>     }
>> >> }
>> >> ---
>> >> 
>> >> > +        for (int i = 0; i < p->zero_num; i++) {
>> >> > +            void *guest = p->block->host + p->zero[i];
>> >> > +            memset(guest, 0, multifd_ram_page_size());
>> >> > +        }  
>> >> 
>> >> At multifd_nocomp_recv, there will be a call to
>> >> multifd_recv_zero_page_process(), which by that point will have p->host
>> >> == p->block->colo_cache, so it looks like that function will do some
>> >> zero page processing in the colo_cache, setting the rb->receivedmap for
>> >> pages in the colo_cache and potentially also doing a memcpy. Is this
>> >> intended?
>> >
>> > rb->receivedmap is only for postcopy, right? So it doesn't apply with
>> > colo.
>> >
>> 
>> It's not anymore since commit 5ef7e26bdb ("migration/multifd: solve zero
>> page causing multiple page faults"). So it seems we might be doing extra
>> work on top of the colo_cache.
>
> IIUC not extra, but exactly what will be needed.
>
> The logic was about "in a vanilla precopy, if we see one page arriving the
> 1st time we don't need to zero the buffer because the buffer should be zero
> allocated".
>
> In COLO's case, COLO always puts RAM data into colo_cache, hence it should
> apply to colo_cache too, avoiding unnecessary memset() for colo_cache
> instead.
>
> E.g. colo_cache is allocated from qemu_anon_ram_alloc(), it's also
> guaranteed to be zeros when never touched.
>
>> 
>> >> 
>> >> I'm thinking that maybe it would overall be better to hook colo directly
>> >> in to multifd_nocomp_recv:
>> >
>> > But then it will only work for nocomp, right? It feels like the wrong
>> > level of abstraction to me.
>> >
>> 
>> Ah, nocomp != ram indeed.
>> 
>> >> 
>> >> static int multifd_nocomp_recv(MultiFDRecvParams *p, Error **errp)
>> >> {
>> >>     uint32_t flags;
>> >> 
>> >>     if (migrate_mapped_ram()) {
>> >>         return multifd_file_recv_data(p, errp);
>> >>     }
>> >> 
>> >>     flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK;
>> >> 
>> >>     if (flags != MULTIFD_FLAG_NOCOMP) {
>> >>         error_setg(errp, "multifd %u: flags received %x flags expected %x",
>> >>                    p->id, flags, MULTIFD_FLAG_NOCOMP);
>> >>         return -1;
>> >>     }
>> >> 
>> >> +    if (migration_incoming_colo_enabled() && migration_incoming_in_colo_state()) {
>> >> +        p->host = p->block->colo_cache;
>> >> +    } // or else{}, depending on how deal with zero pages in the cache
>> >> 
>> >>     multifd_recv_zero_page_process(p);
>> >> 
>> >>     if (!p->normal_num) {
>> >>         return 0;
>> >>     }
>> >> 
>> >>     for (int i = 0; i < p->normal_num; i++) {
>> >>         p->iov[i].iov_base = p->host + p->normal[i];
>> >>         p->iov[i].iov_len = multifd_ram_page_size();
>> >>         ramblock_recv_bitmap_set_offset(p->block, p->normal[i]);
>> >>     }
>> >> +    ret = qio_channel_readv_all(p->c, p->iov, p->normal_num, errp);
>> >> +    if (ret != 0) {
>> >> +        return ret;
>> >> +    }
>> >> +
>> >> +    if (migration_incoming_colo_enabled()) {
>> >> +        multifd_colo_process_recv();
>> >> +    }
>> >> 
>> >>     return ret;
>> >> }
>> >> 
>> >> 
>> >> > +    }
>> >> > +}
>> >> > diff --git a/migration/multifd-colo.h b/migration/multifd-colo.h
>> >> > new file mode 100644
>> >> > index 0000000000000000000000000000000000000000..82eaf3f48c47de2f090f9de52f9d57a337d4754a
>> >> > --- /dev/null
>> >> > +++ b/migration/multifd-colo.h
>> >> > @@ -0,0 +1,26 @@
>> >> > +/*
>> >> > + * SPDX-License-Identifier: GPL-2.0-or-later
>> >> > + *
>> >> > + * multifd colo header
>> >> > + *
>> >> > + * Copyright (c) Lukas Straub <lukasstraub2@web.de>
>> >> > + *
>> >> > + * This work is licensed under the terms of the GNU GPL, version 2 or later.
>> >> > + * See the COPYING file in the top-level directory.
>> >> > + */
>> >> > +
>> >> > +#ifndef QEMU_MIGRATION_MULTIFD_COLO_H
>> >> > +#define QEMU_MIGRATION_MULTIFD_COLO_H
>> >> > +
>> >> > +#ifdef CONFIG_REPLICATION
>> >> > +
>> >> > +void multifd_colo_prepare_recv(MultiFDRecvParams *p);
>> >> > +void multifd_colo_process_recv(MultiFDRecvParams *p);
>> >> > +
>> >> > +#else
>> >> > +
>> >> > +static inline void multifd_colo_prepare_recv(MultiFDRecvParams *p) {}
>> >> > +static inline void multifd_colo_process_recv(MultiFDRecvParams *p) {}
>> >> > +
>> >> > +#endif
>> >> > +#endif
>> >> > diff --git a/migration/multifd-nocomp.c b/migration/multifd-nocomp.c
>> >> > index 9be79b3b8e00371ebff9e112766c225bec260bf7..9f7a792fa761b3bc30b971b35f464103a61787f0 100644
>> >> > --- a/migration/multifd-nocomp.c
>> >> > +++ b/migration/multifd-nocomp.c
>> >> > @@ -16,6 +16,7 @@
>> >> >  #include "file.h"
>> >> >  #include "migration-stats.h"
>> >> >  #include "multifd.h"
>> >> > +#include "multifd-colo.h"
>> >> >  #include "options.h"
>> >> >  #include "migration.h"
>> >> >  #include "qapi/error.h"
>> >> > @@ -269,7 +270,6 @@ int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp)
>> >> >          return -1;
>> >> >      }
>> >> >  
>> >> > -    p->host = p->block->host;
>> >> >      for (i = 0; i < p->normal_num; i++) {
>> >> >          uint64_t offset = be64_to_cpu(packet->offset[i]);
>> >> >  
>> >> > @@ -294,6 +294,14 @@ int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp)
>> >> >          p->zero[i] = offset;
>> >> >      }
>> >> >  
>> >> > +    if (migrate_colo()) {
>> >> > +        multifd_colo_prepare_recv(p);
>> >> > +        assert(p->block->colo_cache);
>> >> > +        p->host = p->block->colo_cache;  
>> >> 
>> >> Can't you just use p->block->colo_cache later? I don't see why p->host
>> >> needs to be set beforehand even in the non-colo case.
>> >
>> > We should not touch the guest ram directly while in colo state, since
>> > the incoming guest is running and we either want to receive and apply a
>> > whole checkpoint with all ram into colo cache and all device state,
>> > or if anything goes wrong during checkpointing, keep the currently
>> > running guest on the incoming side in pristine state.
>> >
>> 
>> I was asking about setting p->host at this specific point. I don't think
>> any of this fits the unfill function. However, I see those were
>> suggested by Peter so let's not go back and forth.
>
> Actually I don't know why p->host existed before this work; IIUC we could
> have always used p->block->host.  Maybe when Juan was developing this Juan
> kept COLO in mind; or maybe Juan wanted to avoid frequent p->block pointer
> reference.
>

Maybe p->block was being reset at some point and p->host was passed
being the point where the (whatever) lock was release. I checked and
today there's no such thing. The p->mutex seems to be there just to
protect against this in multifd_recv_sync_main:

WITH_QEMU_LOCK_GUARD(&p->mutex) {
    if (multifd_recv_state->packet_num < p->packet_num) {
        multifd_recv_state->packet_num = p->packet_num;
    }
}

> IIUC, we could remove p->host, but when we need to access "the buffer of
> the ramblock" we'll need to call a helper to fetch that (either ramblock's
> buffer, or colo_cache, per migrate_colo()).  And it might be slightly
> slower than p->host indeed.
>

Yeah, let's keep it, the compression code also uses it, there's no point
removing it now.

>> 
>> > I have written more about colo migration here:
>> >
>> > https://lore.kernel.org/qemu-devel/20260117204913.584e1829@penguin/
>> > https://lore.kernel.org/qemu-devel/aXE1i9xJ81EWokYz@x1.local/
>> >
>> >> 
>> >> > +    } else {
>> >> > +        p->host = p->block->host;
>> >> > +    }
>> >> > +
>> >> >      return 0;
>> >> >  }
>> >> >  
>> >> > diff --git a/migration/multifd.c b/migration/multifd.c
>> >> > index 332e6fc58053462419f3171f6c320ac37648ef7b..220ed8564960fdabc58e4baa069dd252c8ad293c 100644
>> >> > --- a/migration/multifd.c
>> >> > +++ b/migration/multifd.c
>> >> > @@ -29,6 +29,7 @@
>> >> >  #include "qemu-file.h"
>> >> >  #include "trace.h"
>> >> >  #include "multifd.h"
>> >> > +#include "multifd-colo.h"
>> >> >  #include "options.h"
>> >> >  #include "qemu/yank.h"
>> >> >  #include "io/channel-file.h"
>> >> > @@ -1258,6 +1259,13 @@ static int multifd_ram_state_recv(MultiFDRecvParams *p, Error **errp)
>> >> >      int ret;
>> >> >  
>> >> >      ret = multifd_recv_state->ops->recv(p, errp);
>> >> > +    if (ret != 0) {
>> >> > +        return ret;
>> >> > +    }
>> >> > +
>> >> > +    if (migrate_colo()) {
>> >> > +        multifd_colo_process_recv(p);
>> >> > +    }  
>> >> 
>> >> Either put all of colo hooks in multifd.c or multifd-nocomp.c. I think
>> >> the latter is more appropriate as we have mapped_ram already in
>> >> there. Let's drop patch 3 and put this in multifd_nocomp_recv().
>> >
>> > Again, it also should work with compression and multifd_nocomp_recv()
>> > is for nocomp only.
>> >
>> >> 
>> >> >  
>> >> >      return ret;
>> >> >  }
>> >> > diff --git a/migration/multifd.h b/migration/multifd.h
>> >> > index 89a395aef2b09a6762c45b5361e0ab63256feff6..fbc35702b062fdc3213ce92baed35994f5967c2b 100644
>> >> > --- a/migration/multifd.h
>> >> > +++ b/migration/multifd.h
>> >> > @@ -279,7 +279,10 @@ typedef struct {
>> >> >      uint64_t packets_recved;
>> >> >      /* ramblock */
>> >> >      RAMBlock *block;
>> >> > -    /* ramblock host address */
>> >> > +    /*
>> >> > +     * Normally, it points to ramblock's host address.  When COLO
>> >> > +     * is enabled, it points to the mirror cache for the ramblock.
>> >> > +     */
>> >> >      uint8_t *host;
>> >> >      /* buffers to recv */
>> >> >      struct iovec *iov;  
>>

Re: [PATCH v3 04/10] multifd: Add COLO support

Posted by Lukas Straub 6 days, 12 hours ago

On Wed, 28 Jan 2026 09:30:24 -0300
Fabiano Rosas <farosas@suse.de> wrote:

> Peter Xu <peterx@redhat.com> writes:
> 
> > On Mon, Jan 26, 2026 at 06:37:31PM -0300, Fabiano Rosas wrote:  
> >> Lukas Straub <lukasstraub2@web.de> writes:
> >>   
> >> >> [...]
> >> >>   
> >> >> > +        for (int i = 0; i < p->zero_num; i++) {
> >> >> > +            void *guest = p->block->host + p->zero[i];
> >> >> > +            memset(guest, 0, multifd_ram_page_size());
> >> >> > +        }    
> >> >> 
> >> >> At multifd_nocomp_recv, there will be a call to
> >> >> multifd_recv_zero_page_process(), which by that point will have p->host
> >> >> == p->block->colo_cache, so it looks like that function will do some
> >> >> zero page processing in the colo_cache, setting the rb->receivedmap for
> >> >> pages in the colo_cache and potentially also doing a memcpy. Is this
> >> >> intended?  
> >> >
> >> > rb->receivedmap is only for postcopy, right? So it doesn't apply with
> >> > colo.
> >> >  
> >> 
> >> It's not anymore since commit 5ef7e26bdb ("migration/multifd: solve zero
> >> page causing multiple page faults"). So it seems we might be doing extra
> >> work on top of the colo_cache.  
> >
> > IIUC not extra, but exactly what will be needed.
> >
> > The logic was about "in a vanilla precopy, if we see one page arriving the
> > 1st time we don't need to zero the buffer because the buffer should be zero
> > allocated".
> >
> > In COLO's case, COLO always puts RAM data into colo_cache, hence it should
> > apply to colo_cache too, avoiding unnecessary memset() for colo_cache
> > instead.
> >
> > E.g. colo_cache is allocated from qemu_anon_ram_alloc(), it's also
> > guaranteed to be zeros when never touched.
> >  
> >>   
> >> >> 
> >> >> I'm thinking that maybe it would overall be better to hook colo directly
> >> >> in to multifd_nocomp_recv:  
> >> >
> >> > But then it will only work for nocomp, right? It feels like the wrong
> >> > level of abstraction to me.
> >> >  
> >> 
> >> Ah, nocomp != ram indeed.
> >>   
> >> >> 
> >> >> > [...]
> >> >> > diff --git a/migration/multifd-nocomp.c b/migration/multifd-nocomp.c
> >> >> > index 9be79b3b8e00371ebff9e112766c225bec260bf7..9f7a792fa761b3bc30b971b35f464103a61787f0 100644
> >> >> > --- a/migration/multifd-nocomp.c
> >> >> > +++ b/migration/multifd-nocomp.c
> >> >> > @@ -16,6 +16,7 @@
> >> >> >  #include "file.h"
> >> >> >  #include "migration-stats.h"
> >> >> >  #include "multifd.h"
> >> >> > +#include "multifd-colo.h"
> >> >> >  #include "options.h"
> >> >> >  #include "migration.h"
> >> >> >  #include "qapi/error.h"
> >> >> > @@ -269,7 +270,6 @@ int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp)
> >> >> >          return -1;
> >> >> >      }
> >> >> >  
> >> >> > -    p->host = p->block->host;
> >> >> >      for (i = 0; i < p->normal_num; i++) {
> >> >> >          uint64_t offset = be64_to_cpu(packet->offset[i]);
> >> >> >  
> >> >> > @@ -294,6 +294,14 @@ int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp)
> >> >> >          p->zero[i] = offset;
> >> >> >      }
> >> >> >  
> >> >> > +    if (migrate_colo()) {
> >> >> > +        multifd_colo_prepare_recv(p);
> >> >> > +        assert(p->block->colo_cache);
> >> >> > +        p->host = p->block->colo_cache;    
> >> >> 
> >> >> Can't you just use p->block->colo_cache later? I don't see why p->host
> >> >> needs to be set beforehand even in the non-colo case.  
> >> >
> >> > We should not touch the guest ram directly while in colo state, since
> >> > the incoming guest is running and we either want to receive and apply a
> >> > whole checkpoint with all ram into colo cache and all device state,
> >> > or if anything goes wrong during checkpointing, keep the currently
> >> > running guest on the incoming side in pristine state.
> >> >  
> >> 
> >> I was asking about setting p->host at this specific point. I don't think
> >> any of this fits the unfill function. However, I see those were
> >> suggested by Peter so let's not go back and forth.  
> >
> > Actually I don't know why p->host existed before this work; IIUC we could
> > have always used p->block->host.  Maybe when Juan was developing this Juan
> > kept COLO in mind; or maybe Juan wanted to avoid frequent p->block pointer
> > reference.
> >  
> 
> Maybe p->block was being reset at some point and p->host was passed
> being the point where the (whatever) lock was release. I checked and
> today there's no such thing. The p->mutex seems to be there just to
> protect against this in multifd_recv_sync_main:
> 
> WITH_QEMU_LOCK_GUARD(&p->mutex) {
>     if (multifd_recv_state->packet_num < p->packet_num) {
>         multifd_recv_state->packet_num = p->packet_num;
>     }
> }
> 
> > IIUC, we could remove p->host, but when we need to access "the buffer of
> > the ramblock" we'll need to call a helper to fetch that (either ramblock's
> > buffer, or colo_cache, per migrate_colo()).  And it might be slightly
> > slower than p->host indeed.
> >  
> 
> Yeah, let's keep it, the compression code also uses it, there's no point
> removing it now.
> 

Actually p->host was there first p->block was added later for COLO in
5d1d1fcf4 multifd: Add the ramblock to MultiFDRecvParams

Re: [PATCH v3 04/10] multifd: Add COLO support

Posted by Peter Xu 1 week, 5 days ago

On Wed, Jan 28, 2026 at 09:30:24AM -0300, Fabiano Rosas wrote:
> >> >> > @@ -294,6 +294,14 @@ int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp)
> >> >> >          p->zero[i] = offset;
> >> >> >      }
> >> >> >  
> >> >> > +    if (migrate_colo()) {
> >> >> > +        multifd_colo_prepare_recv(p);
> >> >> > +        assert(p->block->colo_cache);
> >> >> > +        p->host = p->block->colo_cache;  
> >> >> 
> >> >> Can't you just use p->block->colo_cache later? I don't see why p->host
> >> >> needs to be set beforehand even in the non-colo case.
> >> >
> >> > We should not touch the guest ram directly while in colo state, since
> >> > the incoming guest is running and we either want to receive and apply a
> >> > whole checkpoint with all ram into colo cache and all device state,
> >> > or if anything goes wrong during checkpointing, keep the currently
> >> > running guest on the incoming side in pristine state.
> >> >
> >> 
> >> I was asking about setting p->host at this specific point. I don't think
> >> any of this fits the unfill function. However, I see those were
> >> suggested by Peter so let's not go back and forth.
> >
> > Actually I don't know why p->host existed before this work; IIUC we could
> > have always used p->block->host.  Maybe when Juan was developing this Juan
> > kept COLO in mind; or maybe Juan wanted to avoid frequent p->block pointer
> > reference.
> >
> 
> Maybe p->block was being reset at some point and p->host was passed
> being the point where the (whatever) lock was release. I checked and
> today there's no such thing. The p->mutex seems to be there just to
> protect against this in multifd_recv_sync_main:
> 
> WITH_QEMU_LOCK_GUARD(&p->mutex) {
>     if (multifd_recv_state->packet_num < p->packet_num) {
>         multifd_recv_state->packet_num = p->packet_num;
>     }
> }

It should be protected by various checks over migration_is_running().

E.g., QMP device-add & device-del are forbidden so no new pc-dimm hotplug /
removal allowed.  Similarly, virtio_mem_is_busy() would return true during
migration too.

We should definitely make sure ramblock will not be reset during the whole
lifecycle of migration; I believe we're not ready for that..

-- 
Peter Xu

Re: [PATCH v3 04/10] multifd: Add COLO support

Posted by Fabiano Rosas 1 week, 5 days ago

Peter Xu <peterx@redhat.com> writes:

> On Wed, Jan 28, 2026 at 09:30:24AM -0300, Fabiano Rosas wrote:
>> >> >> > @@ -294,6 +294,14 @@ int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp)
>> >> >> >          p->zero[i] = offset;
>> >> >> >      }
>> >> >> >  
>> >> >> > +    if (migrate_colo()) {
>> >> >> > +        multifd_colo_prepare_recv(p);
>> >> >> > +        assert(p->block->colo_cache);
>> >> >> > +        p->host = p->block->colo_cache;  
>> >> >> 
>> >> >> Can't you just use p->block->colo_cache later? I don't see why p->host
>> >> >> needs to be set beforehand even in the non-colo case.
>> >> >
>> >> > We should not touch the guest ram directly while in colo state, since
>> >> > the incoming guest is running and we either want to receive and apply a
>> >> > whole checkpoint with all ram into colo cache and all device state,
>> >> > or if anything goes wrong during checkpointing, keep the currently
>> >> > running guest on the incoming side in pristine state.
>> >> >
>> >> 
>> >> I was asking about setting p->host at this specific point. I don't think
>> >> any of this fits the unfill function. However, I see those were
>> >> suggested by Peter so let's not go back and forth.
>> >
>> > Actually I don't know why p->host existed before this work; IIUC we could
>> > have always used p->block->host.  Maybe when Juan was developing this Juan
>> > kept COLO in mind; or maybe Juan wanted to avoid frequent p->block pointer
>> > reference.
>> >
>> 
>> Maybe p->block was being reset at some point and p->host was passed
>> being the point where the (whatever) lock was release. I checked and
>> today there's no such thing. The p->mutex seems to be there just to
>> protect against this in multifd_recv_sync_main:
>> 
>> WITH_QEMU_LOCK_GUARD(&p->mutex) {
>>     if (multifd_recv_state->packet_num < p->packet_num) {
>>         multifd_recv_state->packet_num = p->packet_num;
>>     }
>> }
>
> It should be protected by various checks over migration_is_running().
>
> E.g., QMP device-add & device-del are forbidden so no new pc-dimm hotplug /
> removal allowed.  Similarly, virtio_mem_is_busy() would return true during
> migration too.
>
> We should definitely make sure ramblock will not be reset during the whole
> lifecycle of migration; I believe we're not ready for that..

The pointer reset, not the block. Anyway, it doesn't happen.

Re: [PATCH v3 04/10] multifd: Add COLO support

Posted by Zhang Chen 2 weeks ago

On Mon, Jan 26, 2026 at 4:40 AM Lukas Straub <lukasstraub2@web.de> wrote:
>
> Like in the normal ram_load() path, put the received pages into the
> colo cache and mark the pages in the bitmap so that they will be
> flushed to the guest later.
>
> Multifd with COLO is useful to reduce the VM pause time during checkpointing
> for latency sensitive workloads. In such workloads the worst-case latency
> is especially important.
>
> Also, this is already worth it for the precopy phase as it helps with
> converging. Moreover, multifd migration is the preferred way to do migration
> nowadays and this allows to use multifd compression with COLO.
>
> Benchmark:
> Cluster nodes
>  - Intel Xenon E5-2630 v3
>  - 48Gb RAM
>  - 10G Ethernet
> Guest
>  - Windows Server 2016
>  - 6Gb RAM
>  - 4 cores
> Workload
>  - Upload a file to the guest with SMB to simulate moderate
>    memory dirtying
>  - Measure the memory transfer time portion of each checkpoint
>  - 600ms COLO checkpoint interval
>
> Results
> Plain
>  idle mean: 4.50ms 99per: 10.33ms
>  load mean: 24.30ms 99per: 78.05ms
> Multifd-4
>  idle mean: 6.48ms 99per: 10.41ms
>  load mean: 14.12ms 99per: 31.27ms
>
> Evaluation
> While multifd has slightly higher latency when the guest idles, it is
> 10ms faster under load and more importantly it's worst case latency is
> less than 1/2 of plain under load as can be seen in the 99. Percentile.
>

Why the multifd get higher latency when the guest idles?  The status same
with normal live migration? Where is the time spent? The Sorry, I
don't know this background yet.

Thanks
Chen

> Signed-off-by: Juan Quintela <quintela@redhat.com>
> Signed-off-by: Lukas Straub <lukasstraub2@web.de>
> ---
>  MAINTAINERS                |  1 +
>  migration/meson.build      |  2 +-
>  migration/multifd-colo.c   | 50 ++++++++++++++++++++++++++++++++++++++++++++++
>  migration/multifd-colo.h   | 26 ++++++++++++++++++++++++
>  migration/multifd-nocomp.c | 10 +++++++++-
>  migration/multifd.c        |  8 ++++++++
>  migration/multifd.h        |  5 ++++-
>  7 files changed, 99 insertions(+), 3 deletions(-)
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 1e9bdd87c3a2f84f3abfc56986cd793976810fdd..883f0a8f4eb92d0bf0f89fcab4674ccc4aed1cc1 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -3853,6 +3853,7 @@ COLO Framework
>  M: Lukas Straub <lukasstraub2@web.de>
>  S: Maintained
>  F: migration/colo*
> +F: migration/multifd-colo.*
>  F: include/migration/colo.h
>  F: include/migration/failover.h
>  F: docs/COLO-FT.txt
> diff --git a/migration/meson.build b/migration/meson.build
> index c7f39bdb55239ecb0e775c77b90a1aa9e6a4a9ce..c9f0f5f9f2137536497e53e960ce70654ad1b394 100644
> --- a/migration/meson.build
> +++ b/migration/meson.build
> @@ -39,7 +39,7 @@ system_ss.add(files(
>  ), gnutls, zlib)
>
>  if get_option('replication').allowed()
> -  system_ss.add(files('colo-failover.c', 'colo.c'))
> +  system_ss.add(files('colo-failover.c', 'colo.c', 'multifd-colo.c'))
>  else
>    system_ss.add(files('colo-stubs.c'))
>  endif
> diff --git a/migration/multifd-colo.c b/migration/multifd-colo.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..c47f5044663969e0c9af56da5ec34902d635810a
> --- /dev/null
> +++ b/migration/multifd-colo.c
> @@ -0,0 +1,50 @@
> +/*
> + * SPDX-License-Identifier: GPL-2.0-or-later
> + *
> + * multifd colo implementation
> + *
> + * Copyright (c) Lukas Straub <lukasstraub2@web.de>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "exec/target_page.h"
> +#include "qemu/error-report.h"
> +#include "qapi/error.h"
> +#include "ram.h"
> +#include "multifd.h"
> +#include "options.h"
> +#include "io/channel-socket.h"
> +#include "migration/colo.h"
> +#include "multifd-colo.h"
> +#include "system/ramblock.h"
> +
> +void multifd_colo_prepare_recv(MultiFDRecvParams *p)
> +{
> +    /*
> +     * While we're still in precopy state (not yet in colo state), we copy
> +     * received pages to both guest and cache. No need to set dirty bits,
> +     * since guest and cache memory are in sync.
> +     */
> +    if (migration_incoming_in_colo_state()) {
> +        colo_record_bitmap(p->block, p->normal, p->normal_num);
> +        colo_record_bitmap(p->block, p->zero, p->zero_num);
> +    }
> +}
> +
> +void multifd_colo_process_recv(MultiFDRecvParams *p)
> +{
> +    if (!migration_incoming_in_colo_state()) {
> +        for (int i = 0; i < p->normal_num; i++) {
> +            void *guest = p->block->host + p->normal[i];
> +            void *cache = p->host + p->normal[i];
> +            memcpy(guest, cache, multifd_ram_page_size());
> +        }
> +        for (int i = 0; i < p->zero_num; i++) {
> +            void *guest = p->block->host + p->zero[i];
> +            memset(guest, 0, multifd_ram_page_size());
> +        }
> +    }
> +}
> diff --git a/migration/multifd-colo.h b/migration/multifd-colo.h
> new file mode 100644
> index 0000000000000000000000000000000000000000..82eaf3f48c47de2f090f9de52f9d57a337d4754a
> --- /dev/null
> +++ b/migration/multifd-colo.h
> @@ -0,0 +1,26 @@
> +/*
> + * SPDX-License-Identifier: GPL-2.0-or-later
> + *
> + * multifd colo header
> + *
> + * Copyright (c) Lukas Straub <lukasstraub2@web.de>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + */
> +
> +#ifndef QEMU_MIGRATION_MULTIFD_COLO_H
> +#define QEMU_MIGRATION_MULTIFD_COLO_H
> +
> +#ifdef CONFIG_REPLICATION
> +
> +void multifd_colo_prepare_recv(MultiFDRecvParams *p);
> +void multifd_colo_process_recv(MultiFDRecvParams *p);
> +
> +#else
> +
> +static inline void multifd_colo_prepare_recv(MultiFDRecvParams *p) {}
> +static inline void multifd_colo_process_recv(MultiFDRecvParams *p) {}
> +
> +#endif
> +#endif
> diff --git a/migration/multifd-nocomp.c b/migration/multifd-nocomp.c
> index 9be79b3b8e00371ebff9e112766c225bec260bf7..9f7a792fa761b3bc30b971b35f464103a61787f0 100644
> --- a/migration/multifd-nocomp.c
> +++ b/migration/multifd-nocomp.c
> @@ -16,6 +16,7 @@
>  #include "file.h"
>  #include "migration-stats.h"
>  #include "multifd.h"
> +#include "multifd-colo.h"
>  #include "options.h"
>  #include "migration.h"
>  #include "qapi/error.h"
> @@ -269,7 +270,6 @@ int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp)
>          return -1;
>      }
>
> -    p->host = p->block->host;
>      for (i = 0; i < p->normal_num; i++) {
>          uint64_t offset = be64_to_cpu(packet->offset[i]);
>
> @@ -294,6 +294,14 @@ int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp)
>          p->zero[i] = offset;
>      }
>
> +    if (migrate_colo()) {
> +        multifd_colo_prepare_recv(p);
> +        assert(p->block->colo_cache);
> +        p->host = p->block->colo_cache;
> +    } else {
> +        p->host = p->block->host;
> +    }
> +
>      return 0;
>  }
>
> diff --git a/migration/multifd.c b/migration/multifd.c
> index 332e6fc58053462419f3171f6c320ac37648ef7b..220ed8564960fdabc58e4baa069dd252c8ad293c 100644
> --- a/migration/multifd.c
> +++ b/migration/multifd.c
> @@ -29,6 +29,7 @@
>  #include "qemu-file.h"
>  #include "trace.h"
>  #include "multifd.h"
> +#include "multifd-colo.h"
>  #include "options.h"
>  #include "qemu/yank.h"
>  #include "io/channel-file.h"
> @@ -1258,6 +1259,13 @@ static int multifd_ram_state_recv(MultiFDRecvParams *p, Error **errp)
>      int ret;
>
>      ret = multifd_recv_state->ops->recv(p, errp);
> +    if (ret != 0) {
> +        return ret;
> +    }
> +
> +    if (migrate_colo()) {
> +        multifd_colo_process_recv(p);
> +    }
>
>      return ret;
>  }
> diff --git a/migration/multifd.h b/migration/multifd.h
> index 89a395aef2b09a6762c45b5361e0ab63256feff6..fbc35702b062fdc3213ce92baed35994f5967c2b 100644
> --- a/migration/multifd.h
> +++ b/migration/multifd.h
> @@ -279,7 +279,10 @@ typedef struct {
>      uint64_t packets_recved;
>      /* ramblock */
>      RAMBlock *block;
> -    /* ramblock host address */
> +    /*
> +     * Normally, it points to ramblock's host address.  When COLO
> +     * is enabled, it points to the mirror cache for the ramblock.
> +     */
>      uint8_t *host;
>      /* buffers to recv */
>      struct iovec *iov;
>
> --
> 2.39.5
>

Re: [PATCH v3 04/10] multifd: Add COLO support

Posted by Lukas Straub 2 weeks ago

On Mon, 26 Jan 2026 18:36:39 +0800
Zhang Chen <zhangckid@gmail.com> wrote:

> On Mon, Jan 26, 2026 at 4:40 AM Lukas Straub <lukasstraub2@web.de> wrote:
> >
> > Like in the normal ram_load() path, put the received pages into the
> > colo cache and mark the pages in the bitmap so that they will be
> > flushed to the guest later.
> >
> > Multifd with COLO is useful to reduce the VM pause time during checkpointing
> > for latency sensitive workloads. In such workloads the worst-case latency
> > is especially important.
> >
> > Also, this is already worth it for the precopy phase as it helps with
> > converging. Moreover, multifd migration is the preferred way to do migration
> > nowadays and this allows to use multifd compression with COLO.
> >
> > Benchmark:
> > Cluster nodes
> >  - Intel Xenon E5-2630 v3
> >  - 48Gb RAM
> >  - 10G Ethernet
> > Guest
> >  - Windows Server 2016
> >  - 6Gb RAM
> >  - 4 cores
> > Workload
> >  - Upload a file to the guest with SMB to simulate moderate
> >    memory dirtying
> >  - Measure the memory transfer time portion of each checkpoint
> >  - 600ms COLO checkpoint interval
> >
> > Results
> > Plain
> >  idle mean: 4.50ms 99per: 10.33ms
> >  load mean: 24.30ms 99per: 78.05ms
> > Multifd-4
> >  idle mean: 6.48ms 99per: 10.41ms
> >  load mean: 14.12ms 99per: 31.27ms
> >
> > Evaluation
> > While multifd has slightly higher latency when the guest idles, it is
> > 10ms faster under load and more importantly it's worst case latency is
> > less than 1/2 of plain under load as can be seen in the 99. Percentile.
> >  
> 
> Why the multifd get higher latency when the guest idles?  The status same
> with normal live migration? Where is the time spent? The Sorry, I
> don't know this background yet.

Not sure, it could be more overhead due to coordinating the multifd
threads.

But it also can be explained from the sample variation. Here I also
calculate the standard deviation of the sample.
60% of samples are within +- one stddev.

plain idle: mean 4.50 99per 10.33 stddev 1.80
plain load: mean 24.30 99per 78.05 stddev 13.65
multifd-4 idle: mean 6.48 99per 10.41 stddev 2.53
multifd-4 load: mean 14.12 99per 31.27 stddev 7.48

So, I don't think its a significant difference.

> 
> Thanks
> Chen
> 
> > Signed-off-by: Juan Quintela <quintela@redhat.com>
> > Signed-off-by: Lukas Straub <lukasstraub2@web.de>
> > ---
> >  MAINTAINERS                |  1 +
> >  migration/meson.build      |  2 +-
> >  migration/multifd-colo.c   | 50 ++++++++++++++++++++++++++++++++++++++++++++++
> >  migration/multifd-colo.h   | 26 ++++++++++++++++++++++++
> >  migration/multifd-nocomp.c | 10 +++++++++-
> >  migration/multifd.c        |  8 ++++++++
> >  migration/multifd.h        |  5 ++++-
> >  7 files changed, 99 insertions(+), 3 deletions(-)
> >
> > diff --git a/MAINTAINERS b/MAINTAINERS
> > index 1e9bdd87c3a2f84f3abfc56986cd793976810fdd..883f0a8f4eb92d0bf0f89fcab4674ccc4aed1cc1 100644
> > --- a/MAINTAINERS
> > +++ b/MAINTAINERS
> > @@ -3853,6 +3853,7 @@ COLO Framework
> >  M: Lukas Straub <lukasstraub2@web.de>
> >  S: Maintained
> >  F: migration/colo*
> > +F: migration/multifd-colo.*
> >  F: include/migration/colo.h
> >  F: include/migration/failover.h
> >  F: docs/COLO-FT.txt
> > diff --git a/migration/meson.build b/migration/meson.build
> > index c7f39bdb55239ecb0e775c77b90a1aa9e6a4a9ce..c9f0f5f9f2137536497e53e960ce70654ad1b394 100644
> > --- a/migration/meson.build
> > +++ b/migration/meson.build
> > @@ -39,7 +39,7 @@ system_ss.add(files(
> >  ), gnutls, zlib)
> >
> >  if get_option('replication').allowed()
> > -  system_ss.add(files('colo-failover.c', 'colo.c'))
> > +  system_ss.add(files('colo-failover.c', 'colo.c', 'multifd-colo.c'))
> >  else
> >    system_ss.add(files('colo-stubs.c'))
> >  endif
> > diff --git a/migration/multifd-colo.c b/migration/multifd-colo.c
> > new file mode 100644
> > index 0000000000000000000000000000000000000000..c47f5044663969e0c9af56da5ec34902d635810a
> > --- /dev/null
> > +++ b/migration/multifd-colo.c
> > @@ -0,0 +1,50 @@
> > +/*
> > + * SPDX-License-Identifier: GPL-2.0-or-later
> > + *
> > + * multifd colo implementation
> > + *
> > + * Copyright (c) Lukas Straub <lukasstraub2@web.de>
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> > + * See the COPYING file in the top-level directory.
> > + */
> > +
> > +#include "qemu/osdep.h"
> > +#include "exec/target_page.h"
> > +#include "qemu/error-report.h"
> > +#include "qapi/error.h"
> > +#include "ram.h"
> > +#include "multifd.h"
> > +#include "options.h"
> > +#include "io/channel-socket.h"
> > +#include "migration/colo.h"
> > +#include "multifd-colo.h"
> > +#include "system/ramblock.h"
> > +
> > +void multifd_colo_prepare_recv(MultiFDRecvParams *p)
> > +{
> > +    /*
> > +     * While we're still in precopy state (not yet in colo state), we copy
> > +     * received pages to both guest and cache. No need to set dirty bits,
> > +     * since guest and cache memory are in sync.
> > +     */
> > +    if (migration_incoming_in_colo_state()) {
> > +        colo_record_bitmap(p->block, p->normal, p->normal_num);
> > +        colo_record_bitmap(p->block, p->zero, p->zero_num);
> > +    }
> > +}
> > +
> > +void multifd_colo_process_recv(MultiFDRecvParams *p)
> > +{
> > +    if (!migration_incoming_in_colo_state()) {
> > +        for (int i = 0; i < p->normal_num; i++) {
> > +            void *guest = p->block->host + p->normal[i];
> > +            void *cache = p->host + p->normal[i];
> > +            memcpy(guest, cache, multifd_ram_page_size());
> > +        }
> > +        for (int i = 0; i < p->zero_num; i++) {
> > +            void *guest = p->block->host + p->zero[i];
> > +            memset(guest, 0, multifd_ram_page_size());
> > +        }
> > +    }
> > +}
> > diff --git a/migration/multifd-colo.h b/migration/multifd-colo.h
> > new file mode 100644
> > index 0000000000000000000000000000000000000000..82eaf3f48c47de2f090f9de52f9d57a337d4754a
> > --- /dev/null
> > +++ b/migration/multifd-colo.h
> > @@ -0,0 +1,26 @@
> > +/*
> > + * SPDX-License-Identifier: GPL-2.0-or-later
> > + *
> > + * multifd colo header
> > + *
> > + * Copyright (c) Lukas Straub <lukasstraub2@web.de>
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> > + * See the COPYING file in the top-level directory.
> > + */
> > +
> > +#ifndef QEMU_MIGRATION_MULTIFD_COLO_H
> > +#define QEMU_MIGRATION_MULTIFD_COLO_H
> > +
> > +#ifdef CONFIG_REPLICATION
> > +
> > +void multifd_colo_prepare_recv(MultiFDRecvParams *p);
> > +void multifd_colo_process_recv(MultiFDRecvParams *p);
> > +
> > +#else
> > +
> > +static inline void multifd_colo_prepare_recv(MultiFDRecvParams *p) {}
> > +static inline void multifd_colo_process_recv(MultiFDRecvParams *p) {}
> > +
> > +#endif
> > +#endif
> > diff --git a/migration/multifd-nocomp.c b/migration/multifd-nocomp.c
> > index 9be79b3b8e00371ebff9e112766c225bec260bf7..9f7a792fa761b3bc30b971b35f464103a61787f0 100644
> > --- a/migration/multifd-nocomp.c
> > +++ b/migration/multifd-nocomp.c
> > @@ -16,6 +16,7 @@
> >  #include "file.h"
> >  #include "migration-stats.h"
> >  #include "multifd.h"
> > +#include "multifd-colo.h"
> >  #include "options.h"
> >  #include "migration.h"
> >  #include "qapi/error.h"
> > @@ -269,7 +270,6 @@ int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp)
> >          return -1;
> >      }
> >
> > -    p->host = p->block->host;
> >      for (i = 0; i < p->normal_num; i++) {
> >          uint64_t offset = be64_to_cpu(packet->offset[i]);
> >
> > @@ -294,6 +294,14 @@ int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp)
> >          p->zero[i] = offset;
> >      }
> >
> > +    if (migrate_colo()) {
> > +        multifd_colo_prepare_recv(p);
> > +        assert(p->block->colo_cache);
> > +        p->host = p->block->colo_cache;
> > +    } else {
> > +        p->host = p->block->host;
> > +    }
> > +
> >      return 0;
> >  }
> >
> > diff --git a/migration/multifd.c b/migration/multifd.c
> > index 332e6fc58053462419f3171f6c320ac37648ef7b..220ed8564960fdabc58e4baa069dd252c8ad293c 100644
> > --- a/migration/multifd.c
> > +++ b/migration/multifd.c
> > @@ -29,6 +29,7 @@
> >  #include "qemu-file.h"
> >  #include "trace.h"
> >  #include "multifd.h"
> > +#include "multifd-colo.h"
> >  #include "options.h"
> >  #include "qemu/yank.h"
> >  #include "io/channel-file.h"
> > @@ -1258,6 +1259,13 @@ static int multifd_ram_state_recv(MultiFDRecvParams *p, Error **errp)
> >      int ret;
> >
> >      ret = multifd_recv_state->ops->recv(p, errp);
> > +    if (ret != 0) {
> > +        return ret;
> > +    }
> > +
> > +    if (migrate_colo()) {
> > +        multifd_colo_process_recv(p);
> > +    }
> >
> >      return ret;
> >  }
> > diff --git a/migration/multifd.h b/migration/multifd.h
> > index 89a395aef2b09a6762c45b5361e0ab63256feff6..fbc35702b062fdc3213ce92baed35994f5967c2b 100644
> > --- a/migration/multifd.h
> > +++ b/migration/multifd.h
> > @@ -279,7 +279,10 @@ typedef struct {
> >      uint64_t packets_recved;
> >      /* ramblock */
> >      RAMBlock *block;
> > -    /* ramblock host address */
> > +    /*
> > +     * Normally, it points to ramblock's host address.  When COLO
> > +     * is enabled, it points to the mirror cache for the ramblock.
> > +     */
> >      uint8_t *host;
> >      /* buffers to recv */
> >      struct iovec *iov;
> >
> > --
> > 2.39.5
> >