migration/ram.c | 16 +++++++++++++--- migration/ram.h | 2 +- migration/rdma.c | 9 ++++++++- 3 files changed, 22 insertions(+), 5 deletions(-)
When the destination receives a zero page during precopy migration,
ram_handle_zero() calls buffer_is_zero() which reads the page. For
anonymous mmap this is benign (reads map to the shared zero page), but
for memory-backend-memfd (mmap(MAP_SHARED) of a memfd) even a read
commits a physical page in the tmpfs page cache.
As a result, after migration all zero pages of the guest are committed
on the destination, turning a sparse RSS into a fully-populated one
(see GitLab issue #2839: a 256 GB VM went from ~4 GB RSS before
migration to ~256 GB after).
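Add a bool can_discard parameter and call madvise(MADV_DONTNEED) when
it is true. This drops the pages from QEMU's mapping without reading the
mapping at all; for private anonymous memory, later accesses are then
zero-filled on demand. The madvise is issued before any read or write,
preventing the initial page fault entirely. Note that for shared
file-backed mappings such as memfd/tmpfs, MADV_DONTNEED does not free or
zero the backing page: the next access repopulates it from the backing
object's current contents.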
Callers pass can_discard = !(block->flags & RAM_PREALLOC) so that
backends with prealloc=on are unaffected: deliberately pre-faulted pages
must not be discarded. On the destination side vCPUs are paused
(RUN_STATE_INMIGRATE) while precopy pages are loaded, so madvise is
race-free.
After migrating a VM with 4 GB of RAM, the RSS on the destination was
reduced to 247 MB (vs. 4148 MB before this change), measured via VmRSS in
/proc/$PID/status.
Relates-to: https://wiki.qemu.org/ToDo/LiveMigration#Avoid_page_population_when_page_is_not_populated
See-also: https://gitlab.com/qemu-project/qemu/-/issues/2839
Signed-off-by: Trieu Huynh <vikingtc4@gmail.com>
---
migration/ram.c | 16 +++++++++++++---
migration/ram.h | 2 +-
migration/rdma.c | 9 ++++++++-
3 files changed, 22 insertions(+), 5 deletions(-)
diff --git a/migration/ram.c b/migration/ram.c
index 2a7e958e87..e57613e29d 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -3638,9 +3638,13 @@ static inline void *colo_cache_from_block_offset(RAMBlock *block,
*
* @host: host address for the zero page
* @size: size of the zero page
+ * @can_discard: true if @host may be discarded (RAMBlock is not prealloc=on)
*/
-void ram_handle_zero(void *host, uint64_t size)
+void ram_handle_zero(void *host, uint64_t size, bool can_discard)
{
+ if (can_discard && qemu_madvise(host, size, QEMU_MADV_DONTNEED) == 0) {
+ return;
+ }
if (!buffer_is_zero(host, size)) {
memset(host, 0, size);
}
@@ -4086,7 +4090,7 @@ static bool handle_zero_mapped_ram(RAMBlock *block, unsigned long from_bit_idx,
block->idstr);
return false;
}
- ram_handle_zero(host, size);
+ ram_handle_zero(host, size, !(block->flags & RAM_PREALLOC));
return true;
}
@@ -4421,7 +4425,13 @@ static int ram_load_precopy(QEMUFile *f)
ret = -EINVAL;
break;
}
- ram_handle_zero(host, TARGET_PAGE_SIZE);
+ {
+ ram_addr_t ram_offset;
+ RAMBlock *rb = qemu_ram_block_from_host(host, false,
+ &ram_offset);
+ bool can_discard = rb && !(rb->flags & RAM_PREALLOC);
+ ram_handle_zero(host, TARGET_PAGE_SIZE, can_discard);
+ }
break;
case RAM_SAVE_FLAG_PAGE:
diff --git a/migration/ram.h b/migration/ram.h
index 41697a7599..faa80f27d1 100644
--- a/migration/ram.h
+++ b/migration/ram.h
@@ -90,7 +90,7 @@ int ram_discard_range(const char *block_name, uint64_t start, size_t length);
int ram_postcopy_incoming_init(MigrationIncomingState *mis, Error **errp);
int ram_load_postcopy(QEMUFile *f, int channel);
-void ram_handle_zero(void *host, uint64_t size);
+void ram_handle_zero(void *host, uint64_t size, bool can_discard);
void ram_transferred_add(uint64_t bytes);
void ram_release_page(const char *rbname, uint64_t offset);
diff --git a/migration/rdma.c b/migration/rdma.c
index 55ab85650a..d4c36af5b9 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -28,6 +28,7 @@
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "qemu/module.h"
+#include "system/ramblock.h"
#include "qemu/rcu.h"
#include "qemu/sockets.h"
#include "qemu/bitmap.h"
@@ -3413,7 +3414,13 @@ int rdma_registration_handle(QEMUFile *f)
comp->value);
goto err;
}
- ram_handle_zero(host_addr, comp->length);
+ {
+ ram_addr_t ram_offset;
+ RAMBlock *rb = qemu_ram_block_from_host(host_addr, false,
+ &ram_offset);
+ bool can_discard = rb && !(rb->flags & RAM_PREALLOC);
+ ram_handle_zero(host_addr, comp->length, can_discard);
+ }
break;
case RDMA_CONTROL_REGISTER_FINISHED:
--
2.43.0
© 2016 - 2026 Red Hat, Inc.