From: Chuang Xu <xuchuangxclwt@bytedance.com>
In our long-term experience at Bytedance, we have found that, under
the same load, live migration of larger VMs with more devices is
often harder to converge (requiring a larger downtime limit).
Through testing and analysis, we concluded that the bitmap sync time
affects the calculation of the live migration bandwidth.
When the addresses being processed are not aligned, a large number of
clear_dirty ioctls occur (e.g. a misaligned 4MB memory region can
generate 2048 clear_dirty ioctls from two different memory
listeners), which increases the time required for bitmap sync and
makes it harder for dirty pages to converge.
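To see where the 2048 figure comes from, here is a back-of-the-envelope
sketch (illustrative only; it assumes 4KiB target pages and two
listeners each clearing per page, and the names are ours, not QEMU API):

    #include <stdio.h>

    #define TARGET_PAGE_SIZE 4096ULL

    int main(void)
    {
        unsigned long long length = 4ULL << 20;  /* misaligned 4MB range */
        /* per-page clearing issues one clear_dirty ioctl per page */
        unsigned long long pages = length / TARGET_PAGE_SIZE; /* 1024 */
        unsigned listeners = 2;           /* two memory listeners */
        printf("clear_dirty ioctls: %llu\n", pages * listeners); /* 2048 */
        return 0;
    }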
For a 64C256G VM with 8 vhost-user-net devices (32 queues per NIC) and
16 vhost-user-blk devices (4 queues per device), the sync time is as
high as *73ms* (tested at a 10GB/s dirty rate; the sync time grows as
the dirty page rate increases). Here is the breakdown of the sync time:
- sync from kvm to ram_list: 2.5ms
- vhost_log_sync: 3ms
- sync aligned memory from ram_list to RAMBlock: 5ms
- sync misaligned memory from ram_list to RAMBlock: 61ms
After merging those fragmented clear_dirty ioctls, syncing misaligned
memory from ram_list to RAMBlock takes only about 1ms, and the total
sync time drops to *12ms*.
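A minimal sketch of the merged flow (our simplification of the hunk in
system/physmem.c below, reusing QEMU's bitmap helpers; the
clear_dirty_range() helper is a hypothetical stand-in for the ranged
memory_region_clear_dirty_bitmap() call):

    /* Fold dirty bits into @dest page by page, then issue one ranged
     * clear instead of one clear_dirty ioctl per page. */
    static uint64_t sync_and_merge(unsigned long *dirty,
                                   unsigned long *dest,
                                   unsigned long first_page,
                                   unsigned long npages)
    {
        uint64_t num_dirty = 0;
        unsigned long p;

        for (p = first_page; p < first_page + npages; p++) {
            /* in-memory bitmap ops only, no ioctl here */
            if (bitmap_test_and_clear_atomic(dirty, p, 1) &&
                !test_and_set_bit(p, dest)) {
                num_dirty++;
            }
        }
        clear_dirty_range(first_page, npages); /* one merged clear */
        return num_dirty;
    }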
Signed-off-by: Chuang Xu <xuchuangxclwt@bytedance.com>
Reviewed-by: Fabiano Rosas <farosas@suse.de>
Link: https://lore.kernel.org/r/20251218114220.83354-1-xuchuangxclwt@bytedance.com
[peterx: drop var "offset" in physical_memory_sync_dirty_bitmap]
Signed-off-by: Peter Xu <peterx@redhat.com>
---
include/system/physmem.h | 7 +++---
accel/tcg/cputlb.c | 5 +++--
migration/ram.c | 19 +++++-----------
system/memory.c | 2 +-
system/physmem.c | 48 +++++++++++++++++++++++++++-------------
5 files changed, 46 insertions(+), 35 deletions(-)
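For reviewers, the two @bmap modes of the new signature can be
summarized as follows (illustrative call sites, mirroring the hunks
below):

    /* No caller-side bitmap: the return value counts dirty pages in
     * the range; nonzero plays the role of the old bool return. */
    if (physical_memory_test_and_clear_dirty(addr, length,
                                             DIRTY_MEMORY_VGA, NULL)) {
        /* at least one page in [addr, addr + length) was dirty */
    }

    /* With a destination bitmap (migration): dirty bits are folded
     * into @bmap, counting only pages not already set there. */
    num_dirty = physical_memory_test_and_clear_dirty(start + rb->offset,
                                                     length,
                                                     DIRTY_MEMORY_MIGRATION,
                                                     rb->bmap);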
diff --git a/include/system/physmem.h b/include/system/physmem.h
index 879f6eae38..a59724ef10 100644
--- a/include/system/physmem.h
+++ b/include/system/physmem.h
@@ -39,9 +39,10 @@ uint64_t physical_memory_set_dirty_lebitmap(unsigned long *bitmap,
void physical_memory_dirty_bits_cleared(ram_addr_t start, ram_addr_t length);
-bool physical_memory_test_and_clear_dirty(ram_addr_t start,
- ram_addr_t length,
- unsigned client);
+uint64_t physical_memory_test_and_clear_dirty(ram_addr_t start,
+ ram_addr_t length,
+ unsigned client,
+ unsigned long *bmap);
DirtyBitmapSnapshot *
physical_memory_snapshot_and_clear_dirty(MemoryRegion *mr, hwaddr offset,
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index fd1606c856..c8827c8b0d 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -857,8 +857,9 @@ void tlb_flush_page_bits_by_mmuidx_all_cpus_synced(CPUState *src_cpu,
void tlb_protect_code(ram_addr_t ram_addr)
{
physical_memory_test_and_clear_dirty(ram_addr & TARGET_PAGE_MASK,
- TARGET_PAGE_SIZE,
- DIRTY_MEMORY_CODE);
+ TARGET_PAGE_SIZE,
+ DIRTY_MEMORY_CODE,
+ NULL);
}
/* update the TLB so that writes in physical page 'phys_addr' are no longer
diff --git a/migration/ram.c b/migration/ram.c
index df7e154877..c403fd73a6 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -941,7 +941,6 @@ static uint64_t physical_memory_sync_dirty_bitmap(RAMBlock *rb,
ram_addr_t start,
ram_addr_t length)
{
- ram_addr_t addr;
unsigned long word = BIT_WORD((start + rb->offset) >> TARGET_PAGE_BITS);
uint64_t num_dirty = 0;
unsigned long *dest = rb->bmap;
@@ -993,19 +992,11 @@ static uint64_t physical_memory_sync_dirty_bitmap(RAMBlock *rb,
memory_region_clear_dirty_bitmap(rb->mr, start, length);
}
} else {
- ram_addr_t offset = rb->offset;
-
- for (addr = 0; addr < length; addr += TARGET_PAGE_SIZE) {
- if (physical_memory_test_and_clear_dirty(
- start + addr + offset,
- TARGET_PAGE_SIZE,
- DIRTY_MEMORY_MIGRATION)) {
- long k = (start + addr) >> TARGET_PAGE_BITS;
- if (!test_and_set_bit(k, dest)) {
- num_dirty++;
- }
- }
- }
+ num_dirty = physical_memory_test_and_clear_dirty(
+ start + rb->offset,
+ length,
+ DIRTY_MEMORY_MIGRATION,
+ dest);
}
return num_dirty;
diff --git a/system/memory.c b/system/memory.c
index 8b84661ae3..666364392d 100644
--- a/system/memory.c
+++ b/system/memory.c
@@ -2424,7 +2424,7 @@ void memory_region_reset_dirty(MemoryRegion *mr, hwaddr addr,
{
assert(mr->ram_block);
physical_memory_test_and_clear_dirty(
- memory_region_get_ram_addr(mr) + addr, size, client);
+ memory_region_get_ram_addr(mr) + addr, size, client, NULL);
}
int memory_region_get_fd(MemoryRegion *mr)
diff --git a/system/physmem.c b/system/physmem.c
index c9869e4049..26bf30af17 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -1089,19 +1089,30 @@ void physical_memory_set_dirty_range(ram_addr_t start, ram_addr_t length,
}
}
-/* Note: start and end must be within the same ram block. */
-bool physical_memory_test_and_clear_dirty(ram_addr_t start,
+/*
+ * Note: start and end must be within the same ram block.
+ *
+ * @bmap usage:
+ * - When @bmap is provided, set bits for dirty pages, but
+ * only count those pages if the bit wasn't already set in @bmap.
+ * - When @bmap is NULL, count all dirty pages in the range.
+ *
+ * @return:
+ * - Number of dirty guest pages found within [start, start + length).
+ */
+uint64_t physical_memory_test_and_clear_dirty(ram_addr_t start,
ram_addr_t length,
- unsigned client)
+ unsigned client,
+ unsigned long *bmap)
{
DirtyMemoryBlocks *blocks;
unsigned long end, page, start_page;
- bool dirty = false;
+ uint64_t num_dirty = 0;
RAMBlock *ramblock;
uint64_t mr_offset, mr_size;
if (length == 0) {
- return false;
+ return 0;
}
end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
@@ -1118,12 +1129,19 @@ bool physical_memory_test_and_clear_dirty(ram_addr_t start,
while (page < end) {
unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
- unsigned long num = MIN(end - page,
- DIRTY_MEMORY_BLOCK_SIZE - offset);
- dirty |= bitmap_test_and_clear_atomic(blocks->blocks[idx],
- offset, num);
- page += num;
+ if (bitmap_test_and_clear_atomic(blocks->blocks[idx], offset, 1)) {
+ if (bmap) {
+ unsigned long k = page - (ramblock->offset >> TARGET_PAGE_BITS);
+ if (!test_and_set_bit(k, bmap)) {
+ num_dirty++;
+ }
+ } else {
+ num_dirty++;
+ }
+ }
+
+ page++;
}
mr_offset = (ram_addr_t)(start_page << TARGET_PAGE_BITS) - ramblock->offset;
@@ -1131,18 +1149,18 @@ bool physical_memory_test_and_clear_dirty(ram_addr_t start,
memory_region_clear_dirty_bitmap(ramblock->mr, mr_offset, mr_size);
}
- if (dirty) {
+ if (num_dirty) {
physical_memory_dirty_bits_cleared(start, length);
}
- return dirty;
+ return num_dirty;
}
static void physical_memory_clear_dirty_range(ram_addr_t addr, ram_addr_t length)
{
- physical_memory_test_and_clear_dirty(addr, length, DIRTY_MEMORY_MIGRATION);
- physical_memory_test_and_clear_dirty(addr, length, DIRTY_MEMORY_VGA);
- physical_memory_test_and_clear_dirty(addr, length, DIRTY_MEMORY_CODE);
+ physical_memory_test_and_clear_dirty(addr, length, DIRTY_MEMORY_MIGRATION, NULL);
+ physical_memory_test_and_clear_dirty(addr, length, DIRTY_MEMORY_VGA, NULL);
+ physical_memory_test_and_clear_dirty(addr, length, DIRTY_MEMORY_CODE, NULL);
}
DirtyBitmapSnapshot *physical_memory_snapshot_and_clear_dirty
--
2.50.1