Currently, a huge VM under heavy memory load may take a long time to raise its throttle percentage to the maximum. The root cause is that the current auto-converge throttle logic doesn't scale: migration_trigger_throttle() is only called once per iteration, so it won't be invoked for a long time if one iteration takes a long time. This patchset provides two refinements aimed at the above case. 1: The periodic CPU throttle. As Peter points out, "throttle only for each sync, sync for each iteration" may have made sense in the old days, but perhaps not anymore. So we introduce a periodic CPU throttle implementation for migration, which is a trade-off between synchronization overhead and CPU throttle impact. 2: The responsive CPU throttle. We present a new criterion called the "dirty ratio" to help improve detection accuracy and hence accelerate the throttle's invocation. The RFC version of the refinement may be a rudimentary implementation; I would appreciate hearing more feedback. Thanks, Yong. Hyman Huang (10): migration: Introduce structs for periodic CPU throttle migration: Refine util functions to support periodic CPU throttle qapi/migration: Introduce periodic CPU throttling parameters qapi/migration: Introduce the iteration-count migration: Introduce util functions for periodic CPU throttle migration: Support periodic CPU throttle tests/migration-tests: Add test case for periodic throttle migration: Introduce cpu-responsive-throttle parameter migration: Support responsive CPU throttle tests/migration-tests: Add test case for responsive CPU throttle include/exec/ram_addr.h | 117 ++++++++++++++++-- include/exec/ramblock.h | 45 +++++++ migration/migration-hmp-cmds.c | 25 ++++ migration/migration-stats.h | 4 + migration/migration.c | 12 ++ migration/options.c | 74 +++++++++++ migration/options.h | 3 + migration/ram.c | 218 ++++++++++++++++++++++++++++++--- migration/ram.h | 4 + migration/trace-events | 4 + qapi/migration.json | 45 ++++++- tests/qtest/migration-test.c | 77 +++++++++++- 12 files changed, 600 insertions(+), 28 deletions(-) -- 2.39.1
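[Editor's note] To illustrate the timing problem the cover letter describes, here is a minimal standalone sketch with hypothetical numbers (not part of the series) contrasting per-iteration throttle checks with periodic ones:

    /* toy_throttle_timing.c - minimal sketch, hypothetical numbers */
    #include <stdio.h>

    int main(void)
    {
        const int iter_ms = 60000;   /* one iteration on a huge VM: 60s */
        const int period_ms = 5000;  /* assumed periodic sync interval: 5s */

        /* Today: migration_trigger_throttle() runs once per iteration,
         * so the first chance to raise the throttle comes after 60s. */
        printf("per-iteration: first throttle check at %d ms\n", iter_ms);

        /* With periodic sync: the throttle can react many times within
         * the same iteration. */
        for (int t = period_ms; t <= iter_ms; t += period_ms) {
            printf("periodic: throttle check at %d ms\n", t);
        }
        return 0;
    }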
The shadow_bmap, iter_bmap, iter_dirty_pages and periodic_sync_shown_up fields are introduced to support periodic CPU throttling. Meanwhile, introduce an enumeration of dirty bitmap sync methods. Signed-off-by: Hyman Huang <yong.huang@smartx.com> --- include/exec/ramblock.h | 45 +++++++++++++++++++++++++++++++++++++++++ migration/ram.c | 6 ++++++ 2 files changed, 51 insertions(+) diff --git a/include/exec/ramblock.h b/include/exec/ramblock.h index XXXXXXX..XXXXXXX 100644 --- a/include/exec/ramblock.h +++ b/include/exec/ramblock.h @@ -XXX,XX +XXX,XX @@ #include "qemu/rcu.h" #include "exec/ramlist.h" +/* Possible bits for migration_bitmap_sync */ + +/* + * The old-fashioned sync method, which is used for both CPU + * throttle and memory transfer. + */ +#define RAMBLOCK_SYN_LEGACY_ITER (1U << 0) + +/* + * The modern sync method, which is used for both CPU throttle + * and memory transfer. + */ +#define RAMBLOCK_SYN_MODERN_ITER (1U << 1) + +/* The modern sync method, which is used for CPU throttle only */ +#define RAMBLOCK_SYN_MODERN_PERIOD (1U << 2) + +#define RAMBLOCK_SYN_MASK (0x7) + +typedef enum RAMBlockSynMode { + RAMBLOCK_SYN_LEGACY, /* Old-fashioned mode */ + RAMBLOCK_SYN_MODERN, +} RAMBlockSynMode; + struct RAMBlock { struct rcu_head rcu; struct MemoryRegion *mr; @@ -XXX,XX +XXX,XX @@ struct RAMBlock { * could not have been valid on the source. */ ram_addr_t postcopy_length; + + /* + * Used to back up the bmap during periodic sync to see whether any dirty + * pages were sent during that time. + */ + unsigned long *shadow_bmap; + + /* + * The bitmap "bmap," which was initially used for both sync and memory + * transfer, will be replaced by two bitmaps: the previously used "bmap" + * and the recently added "iter_bmap." Only the memory transfer is + * conducted with the previously used "bmap"; the recently added + * "iter_bmap" is utilized for sync. + */ + unsigned long *iter_bmap; + + /* Number of new dirty pages during iteration */ + uint64_t iter_dirty_pages; + + /* If periodic sync has shown up during iteration */ + bool periodic_sync_shown_up; }; #endif #endif diff --git a/migration/ram.c b/migration/ram.c index XXXXXXX..XXXXXXX 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -XXX,XX +XXX,XX @@ static void ram_bitmaps_destroy(void) block->bmap = NULL; g_free(block->file_bmap); block->file_bmap = NULL; + g_free(block->shadow_bmap); + block->shadow_bmap = NULL; + g_free(block->iter_bmap); + block->iter_bmap = NULL; } } @@ -XXX,XX +XXX,XX @@ static void ram_list_init_bitmaps(void) } block->clear_bmap_shift = shift; block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift)); + block->shadow_bmap = bitmap_new(pages); + block->iter_bmap = bitmap_new(pages); } } } -- 2.39.1
Add a periodic argument to the migration_bitmap_sync function. Introduce the sync_mode global variable to track the sync mode and support periodic throttling while keeping backward compatibility. Signed-off-by: Hyman Huang <yong.huang@smartx.com> --- include/exec/ram_addr.h | 117 ++++++++++++++++++++++++++++++++++++---- migration/ram.c | 49 +++++++++++++---- 2 files changed, 147 insertions(+), 19 deletions(-) diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h index XXXXXXX..XXXXXXX 100644 --- a/include/exec/ram_addr.h +++ b/include/exec/ram_addr.h @@ -XXX,XX +XXX,XX @@ static inline void cpu_physical_memory_clear_dirty_range(ram_addr_t start, cpu_physical_memory_test_and_clear_dirty(start, length, DIRTY_MEMORY_CODE); } +static void ramblock_clear_iter_bmap(RAMBlock *rb, + ram_addr_t start, + ram_addr_t length) +{ + ram_addr_t addr; + unsigned long *bmap = rb->bmap; + unsigned long *shadow_bmap = rb->shadow_bmap; + unsigned long *iter_bmap = rb->iter_bmap; + + for (addr = 0; addr < length; addr += TARGET_PAGE_SIZE) { + long k = (start + addr) >> TARGET_PAGE_BITS; + if (test_bit(k, shadow_bmap) && !test_bit(k, bmap)) { + /* Page has been sent, clear the iter bmap */ + clear_bit(k, iter_bmap); + } + } +} + +static void ramblock_update_iter_bmap(RAMBlock *rb, + ram_addr_t start, + ram_addr_t length) +{ + ram_addr_t addr; + unsigned long *bmap = rb->bmap; + unsigned long *iter_bmap = rb->iter_bmap; + + for (addr = 0; addr < length; addr += TARGET_PAGE_SIZE) { + long k = (start + addr) >> TARGET_PAGE_BITS; + if (test_bit(k, iter_bmap)) { + if (!test_bit(k, bmap)) { + set_bit(k, bmap); + rb->iter_dirty_pages++; + } + } + } +} /* Called with RCU critical section */ static inline uint64_t cpu_physical_memory_sync_dirty_bitmap(RAMBlock *rb, ram_addr_t start, - ram_addr_t length) + ram_addr_t length, + unsigned int flag) { ram_addr_t addr; unsigned long word = BIT_WORD((start + rb->offset) >> TARGET_PAGE_BITS); uint64_t num_dirty = 0; unsigned long *dest = rb->bmap; + unsigned long *shadow_bmap = rb->shadow_bmap; + unsigned long *iter_bmap = rb->iter_bmap; + + assert(flag && !(flag & (~RAMBLOCK_SYN_MASK))); + + /* + * We must remove the sent dirty page from the iter_bmap in order to + * minimize redundant page transfers if periodic sync has appeared + * during this iteration. + */ + if (rb->periodic_sync_shown_up && + (flag & (RAMBLOCK_SYN_MODERN_ITER | RAMBLOCK_SYN_MODERN_PERIOD))) { + ramblock_clear_iter_bmap(rb, start, length); + } /* start address and length is aligned at the start of a word? 
*/ if (((word * BITS_PER_LONG) << TARGET_PAGE_BITS) == @@ -XXX,XX +XXX,XX @@ uint64_t cpu_physical_memory_sync_dirty_bitmap(RAMBlock *rb, if (src[idx][offset]) { unsigned long bits = qatomic_xchg(&src[idx][offset], 0); unsigned long new_dirty; + if (flag & (RAMBLOCK_SYN_MODERN_ITER | + RAMBLOCK_SYN_MODERN_PERIOD)) { + /* Back-up bmap for the next iteration */ + iter_bmap[k] |= bits; + if (flag == RAMBLOCK_SYN_MODERN_PERIOD) { + /* Back-up bmap to detect pages that have been sent */ + shadow_bmap[k] = dest[k]; + } + } new_dirty = ~dest[k]; - dest[k] |= bits; + if (flag == RAMBLOCK_SYN_LEGACY_ITER) { + dest[k] |= bits; + } + new_dirty &= bits; num_dirty += ctpopl(new_dirty); } @@ -XXX,XX +XXX,XX @@ uint64_t cpu_physical_memory_sync_dirty_bitmap(RAMBlock *rb, ram_addr_t offset = rb->offset; for (addr = 0; addr < length; addr += TARGET_PAGE_SIZE) { - if (cpu_physical_memory_test_and_clear_dirty( - start + addr + offset, - TARGET_PAGE_SIZE, - DIRTY_MEMORY_MIGRATION)) { - long k = (start + addr) >> TARGET_PAGE_BITS; - if (!test_and_set_bit(k, dest)) { - num_dirty++; + long k = (start + addr) >> TARGET_PAGE_BITS; + if (flag == RAMBLOCK_SYN_MODERN_PERIOD) { + if (test_bit(k, dest)) { + /* Back-up bmap to detect pages that have been sent */ + set_bit(k, shadow_bmap); + } + } + + if (flag == RAMBLOCK_SYN_LEGACY_ITER) { + if (cpu_physical_memory_test_and_clear_dirty( + start + addr + offset, + TARGET_PAGE_SIZE, + DIRTY_MEMORY_MIGRATION)) { + if (!test_and_set_bit(k, dest)) { + num_dirty++; + } + } + } else { + if (cpu_physical_memory_test_and_clear_dirty( + start + addr + offset, + TARGET_PAGE_SIZE, + DIRTY_MEMORY_MIGRATION)) { + if (!test_bit(k, dest)) { + num_dirty++; + } + /* Back-up bmap for the next iteration */ + set_bit(k, iter_bmap); } } } } + /* + * If periodic sync has emerged, we have to resync every dirty + * page from the iter_bmap one by one. It's possible that not + * all of the dirty pages that this iteration is meant to send + * are included in the bitmap that the current sync retrieved + * from KVM. + */ + if (rb->periodic_sync_shown_up && + (flag == RAMBLOCK_SYN_MODERN_ITER)) { + ramblock_update_iter_bmap(rb, start, length); + } + + if (flag == RAMBLOCK_SYN_MODERN_PERIOD) { + rb->periodic_sync_shown_up = true; + } + return num_dirty; } #endif diff --git a/migration/ram.c b/migration/ram.c index XXXXXXX..XXXXXXX 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -XXX,XX +XXX,XX @@ XBZRLECacheStats xbzrle_counters; +static RAMBlockSynMode sync_mode = RAMBLOCK_SYN_LEGACY; + /* used by the search for pages to send */ struct PageSearchStatus { /* The migration channel used for a specific host page */ @@ -XXX,XX +XXX,XX @@ bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start) return false; } +static void ramblock_reset_iter_stats(RAMBlock *rb) +{ + bitmap_clear(rb->shadow_bmap, 0, rb->used_length >> TARGET_PAGE_BITS); + bitmap_clear(rb->iter_bmap, 0, rb->used_length >> TARGET_PAGE_BITS); + rb->iter_dirty_pages = 0; + rb->periodic_sync_shown_up = false; +} + /* Called with RCU critical section */ -static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb) +static void ramblock_sync_dirty_bitmap(RAMState *rs, + RAMBlock *rb, + bool periodic) { - uint64_t new_dirty_pages = - cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length); + uint64_t new_dirty_pages; + unsigned int flag = RAMBLOCK_SYN_LEGACY_ITER; + + if (sync_mode == RAMBLOCK_SYN_MODERN) { + flag = periodic ? 
RAMBLOCK_SYN_MODERN_PERIOD : RAMBLOCK_SYN_MODERN_ITER; + } + + new_dirty_pages = + cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length, flag); + + if (flag & (RAMBLOCK_SYN_LEGACY_ITER | RAMBLOCK_SYN_MODERN_ITER)) { + if (flag == RAMBLOCK_SYN_LEGACY_ITER) { + rs->migration_dirty_pages += new_dirty_pages; + } else { + rs->migration_dirty_pages += rb->iter_dirty_pages; + ramblock_reset_iter_stats(rb); + } + } - rs->migration_dirty_pages += new_dirty_pages; rs->num_dirty_pages_period += new_dirty_pages; } @@ -XXX,XX +XXX,XX @@ static void migration_trigger_throttle(RAMState *rs) } } -static void migration_bitmap_sync(RAMState *rs, bool last_stage) +static void migration_bitmap_sync(RAMState *rs, + bool last_stage, + bool periodic) { RAMBlock *block; int64_t end_time; @@ -XXX,XX +XXX,XX @@ static void migration_bitmap_sync(RAMState *rs, bool last_stage) WITH_QEMU_LOCK_GUARD(&rs->bitmap_mutex) { WITH_RCU_READ_LOCK_GUARD() { RAMBLOCK_FOREACH_NOT_IGNORED(block) { - ramblock_sync_dirty_bitmap(rs, block); + ramblock_sync_dirty_bitmap(rs, block, periodic); } stat64_set(&mig_stats.dirty_bytes_last_sync, ram_bytes_remaining()); } @@ -XXX,XX +XXX,XX @@ static void migration_bitmap_sync_precopy(RAMState *rs, bool last_stage) local_err = NULL; } - migration_bitmap_sync(rs, last_stage); + migration_bitmap_sync(rs, last_stage, false); if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) { error_report_err(local_err); @@ -XXX,XX +XXX,XX @@ void ram_postcopy_send_discard_bitmap(MigrationState *ms) RCU_READ_LOCK_GUARD(); /* This should be our last sync, the src is now paused */ - migration_bitmap_sync(rs, false); + migration_bitmap_sync(rs, false, false); /* Easiest way to make sure we don't resume in the middle of a host-page */ rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL; @@ -XXX,XX +XXX,XX @@ void colo_incoming_start_dirty_log(void) memory_global_dirty_log_sync(false); WITH_RCU_READ_LOCK_GUARD() { RAMBLOCK_FOREACH_NOT_IGNORED(block) { - ramblock_sync_dirty_bitmap(ram_state, block); + ramblock_sync_dirty_bitmap(ram_state, block, false); /* Discard this dirty bitmap record */ bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS); } @@ -XXX,XX +XXX,XX @@ void colo_flush_ram_cache(void) qemu_mutex_lock(&ram_state->bitmap_mutex); WITH_RCU_READ_LOCK_GUARD() { RAMBLOCK_FOREACH_NOT_IGNORED(block) { - ramblock_sync_dirty_bitmap(ram_state, block); + ramblock_sync_dirty_bitmap(ram_state, block, false); } } -- 2.39.1
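[Editor's note] The interplay of the three bitmaps above is subtle; the following reading aid (not part of the patch) walks one dirty page k through a periodic-plus-iteration sync cycle in RAMBLOCK_SYN_MODERN mode, restating the logic of the two patches above:

    /*
     * Reading aid: lifecycle of one dirty page k in MODERN mode.
     *
     * 1. Periodic sync (RAMBLOCK_SYN_MODERN_PERIOD): KVM reports k dirty,
     *    so set_bit(k, iter_bmap); shadow_bmap[k] snapshots bmap[k] so a
     *    later iteration sync can tell whether the page was sent in the
     *    meantime; bmap itself is NOT updated, so the sender is unaffected.
     * 2. The migration thread sends page k and clears its bit in bmap.
     * 3. Iteration sync (RAMBLOCK_SYN_MODERN_ITER): if
     *    test_bit(k, shadow_bmap) && !test_bit(k, bmap), the page was
     *    already sent, so clear_bit(k, iter_bmap) to avoid a redundant
     *    transfer; otherwise iter_bmap is folded back into bmap by
     *    ramblock_update_iter_bmap() and counted in iter_dirty_pages.
     */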
To activate the periodic CPU throttling feature, introduce the cpu-periodic-throttle parameter. To control the frequency of throttling, introduce the cpu-periodic-throttle-interval parameter. Signed-off-by: Hyman Huang <yong.huang@smartx.com> --- migration/migration-hmp-cmds.c | 17 +++++++++++ migration/options.c | 54 ++++++++++++++++++++++++++++++++++ migration/options.h | 2 ++ qapi/migration.json | 25 +++++++++++++++- 4 files changed, 97 insertions(+), 1 deletion(-) diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c index XXXXXXX..XXXXXXX 100644 --- a/migration/migration-hmp-cmds.c +++ b/migration/migration-hmp-cmds.c @@ -XXX,XX +XXX,XX @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict) monitor_printf(mon, "%s: %s\n", MigrationParameter_str(MIGRATION_PARAMETER_CPU_THROTTLE_TAILSLOW), params->cpu_throttle_tailslow ? "on" : "off"); + assert(params->has_cpu_periodic_throttle); + monitor_printf(mon, "%s: %s\n", + MigrationParameter_str(MIGRATION_PARAMETER_CPU_PERIODIC_THROTTLE), + params->cpu_periodic_throttle ? "on" : "off"); + assert(params->has_cpu_periodic_throttle_interval); + monitor_printf(mon, "%s: %u\n", + MigrationParameter_str( + MIGRATION_PARAMETER_CPU_PERIODIC_THROTTLE_INTERVAL), + params->cpu_periodic_throttle_interval); assert(params->has_max_cpu_throttle); monitor_printf(mon, "%s: %u\n", MigrationParameter_str(MIGRATION_PARAMETER_MAX_CPU_THROTTLE), @@ -XXX,XX +XXX,XX @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) p->has_cpu_throttle_tailslow = true; visit_type_bool(v, param, &p->cpu_throttle_tailslow, &err); break; + case MIGRATION_PARAMETER_CPU_PERIODIC_THROTTLE: + p->has_cpu_periodic_throttle = true; + visit_type_bool(v, param, &p->cpu_periodic_throttle, &err); + break; + case MIGRATION_PARAMETER_CPU_PERIODIC_THROTTLE_INTERVAL: + p->has_cpu_periodic_throttle_interval = true; + visit_type_uint8(v, param, &p->cpu_periodic_throttle_interval, &err); + break; case MIGRATION_PARAMETER_MAX_CPU_THROTTLE: p->has_max_cpu_throttle = true; visit_type_uint8(v, param, &p->max_cpu_throttle, &err); diff --git a/migration/options.c b/migration/options.c index XXXXXXX..XXXXXXX 100644 --- a/migration/options.c +++ b/migration/options.c @@ -XXX,XX +XXX,XX @@ #define DEFAULT_MIGRATE_THROTTLE_TRIGGER_THRESHOLD 50 #define DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL 20 #define DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT 10 +#define DEFAULT_MIGRATE_CPU_PERIODIC_THROTTLE_INTERVAL 5 #define DEFAULT_MIGRATE_MAX_CPU_THROTTLE 99 /* Migration XBZRLE default cache size */ @@ -XXX,XX +XXX,XX @@ Property migration_properties[] = { DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT), DEFINE_PROP_BOOL("x-cpu-throttle-tailslow", MigrationState, parameters.cpu_throttle_tailslow, false), + DEFINE_PROP_BOOL("x-cpu-periodic-throttle", MigrationState, + parameters.cpu_periodic_throttle, false), + DEFINE_PROP_UINT8("x-cpu-periodic-throttle-interval", MigrationState, + parameters.cpu_periodic_throttle_interval, + DEFAULT_MIGRATE_CPU_PERIODIC_THROTTLE_INTERVAL), DEFINE_PROP_SIZE("x-max-bandwidth", MigrationState, parameters.max_bandwidth, MAX_THROTTLE), DEFINE_PROP_SIZE("avail-switchover-bandwidth", MigrationState, @@ -XXX,XX +XXX,XX @@ uint8_t migrate_cpu_throttle_initial(void) return s->parameters.cpu_throttle_initial; } +uint8_t migrate_periodic_throttle_interval(void) +{ + MigrationState *s = migrate_get_current(); + + return s->parameters.cpu_periodic_throttle_interval; +} + +bool migrate_periodic_throttle(void) +{ + MigrationState *s = migrate_get_current(); + + return 
s->parameters.cpu_periodic_throttle; +} + bool migrate_cpu_throttle_tailslow(void) { MigrationState *s = migrate_get_current(); @@ -XXX,XX +XXX,XX @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp) params->cpu_throttle_increment = s->parameters.cpu_throttle_increment; params->has_cpu_throttle_tailslow = true; params->cpu_throttle_tailslow = s->parameters.cpu_throttle_tailslow; + params->has_cpu_periodic_throttle = true; + params->cpu_periodic_throttle = s->parameters.cpu_periodic_throttle; + params->has_cpu_periodic_throttle_interval = true; + params->cpu_periodic_throttle_interval = + s->parameters.cpu_periodic_throttle_interval; params->tls_creds = g_strdup(s->parameters.tls_creds); params->tls_hostname = g_strdup(s->parameters.tls_hostname); params->tls_authz = g_strdup(s->parameters.tls_authz ? @@ -XXX,XX +XXX,XX @@ void migrate_params_init(MigrationParameters *params) params->has_cpu_throttle_initial = true; params->has_cpu_throttle_increment = true; params->has_cpu_throttle_tailslow = true; + params->has_cpu_periodic_throttle = true; + params->has_cpu_periodic_throttle_interval = true; params->has_max_bandwidth = true; params->has_downtime_limit = true; params->has_x_checkpoint_delay = true; @@ -XXX,XX +XXX,XX @@ bool migrate_params_check(MigrationParameters *params, Error **errp) return false; } + if (params->has_cpu_periodic_throttle_interval && + (params->cpu_periodic_throttle_interval < 2 || + params->cpu_periodic_throttle_interval > 10)) { + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, + "cpu_periodic_throttle_interval", + "an integer in the range of 2 to 10"); + return false; + } + if (params->has_max_bandwidth && (params->max_bandwidth > SIZE_MAX)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "max_bandwidth", @@ -XXX,XX +XXX,XX @@ static void migrate_params_test_apply(MigrateSetParameters *params, dest->cpu_throttle_tailslow = params->cpu_throttle_tailslow; } + if (params->has_cpu_periodic_throttle) { + dest->cpu_periodic_throttle = params->cpu_periodic_throttle; + } + + if (params->has_cpu_periodic_throttle_interval) { + dest->cpu_periodic_throttle_interval = + params->cpu_periodic_throttle_interval; + } + if (params->tls_creds) { assert(params->tls_creds->type == QTYPE_QSTRING); dest->tls_creds = params->tls_creds->u.s; @@ -XXX,XX +XXX,XX @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp) s->parameters.cpu_throttle_tailslow = params->cpu_throttle_tailslow; } + if (params->has_cpu_periodic_throttle) { + s->parameters.cpu_periodic_throttle = params->cpu_periodic_throttle; + } + + if (params->has_cpu_periodic_throttle_interval) { + s->parameters.cpu_periodic_throttle_interval = + params->cpu_periodic_throttle_interval; + } + if (params->tls_creds) { g_free(s->parameters.tls_creds); assert(params->tls_creds->type == QTYPE_QSTRING); diff --git a/migration/options.h b/migration/options.h index XXXXXXX..XXXXXXX 100644 --- a/migration/options.h +++ b/migration/options.h @@ -XXX,XX +XXX,XX @@ bool migrate_has_block_bitmap_mapping(void); uint32_t migrate_checkpoint_delay(void); uint8_t migrate_cpu_throttle_increment(void); uint8_t migrate_cpu_throttle_initial(void); +uint8_t migrate_periodic_throttle_interval(void); +bool migrate_periodic_throttle(void); bool migrate_cpu_throttle_tailslow(void); bool migrate_direct_io(void); uint64_t migrate_downtime_limit(void); diff --git a/qapi/migration.json b/qapi/migration.json index XXXXXXX..XXXXXXX 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -XXX,XX +XXX,XX @@ # be excessive 
at tail stage. The default value is false. (Since # 5.1) # +# @cpu-periodic-throttle: Enable periodic CPU throttling. +# (Since 9.1) +# +# @cpu-periodic-throttle-interval: Interval, in seconds, of the periodic +# CPU throttling. (Since 9.1) +# # @tls-creds: ID of the 'tls-creds' object that provides credentials # for establishing a TLS connection over the migration data # channel. On the outgoing side of the migration, the credentials @@ -XXX,XX +XXX,XX @@ 'announce-rounds', 'announce-step', 'throttle-trigger-threshold', 'cpu-throttle-initial', 'cpu-throttle-increment', - 'cpu-throttle-tailslow', + 'cpu-throttle-tailslow', 'cpu-periodic-throttle', + 'cpu-periodic-throttle-interval', 'tls-creds', 'tls-hostname', 'tls-authz', 'max-bandwidth', 'avail-switchover-bandwidth', 'downtime-limit', { 'name': 'x-checkpoint-delay', 'features': [ 'unstable' ] }, @@ -XXX,XX +XXX,XX @@ # be excessive at tail stage. The default value is false. (Since # 5.1) # +# @cpu-periodic-throttle: Enable periodic CPU throttling. +# (Since 9.1) +# +# @cpu-periodic-throttle-interval: Interval, in seconds, of the periodic +# CPU throttling. (Since 9.1) +# # @tls-creds: ID of the 'tls-creds' object that provides credentials # for establishing a TLS connection over the migration data # channel. On the outgoing side of the migration, the credentials @@ -XXX,XX +XXX,XX @@ '*cpu-throttle-initial': 'uint8', '*cpu-throttle-increment': 'uint8', '*cpu-throttle-tailslow': 'bool', + '*cpu-periodic-throttle': 'bool', + '*cpu-periodic-throttle-interval': 'uint8', '*tls-creds': 'StrOrNull', '*tls-hostname': 'StrOrNull', '*tls-authz': 'StrOrNull', @@ -XXX,XX +XXX,XX @@ # be excessive at tail stage. The default value is false. (Since # 5.1) # +# @cpu-periodic-throttle: Enable periodic CPU throttling. +# (Since 9.1) +# +# @cpu-periodic-throttle-interval: Interval, in seconds, of the periodic +# CPU throttling. (Since 9.1) +# # @tls-creds: ID of the 'tls-creds' object that provides credentials # for establishing a TLS connection over the migration data # channel. On the outgoing side of the migration, the credentials @@ -XXX,XX +XXX,XX @@ '*cpu-throttle-initial': 'uint8', '*cpu-throttle-increment': 'uint8', '*cpu-throttle-tailslow': 'bool', + '*cpu-periodic-throttle': 'bool', + '*cpu-periodic-throttle-interval': 'uint8', '*tls-creds': 'StrOrNull', '*tls-hostname': 'StrOrNull', '*tls-authz': 'StrOrNull', -- 2.39.1
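[Editor's note] Once this patch is applied, the two parameters can be toggled like any other migration parameter. A hypothetical usage sketch in the style of the qtest helpers exercised later in this series (not a new API):

    /* Sketch: enable periodic throttling on the source QTestState "from";
     * the helper names mirror tests/qtest/migration-test.c. */
    migrate_set_parameter_bool(from, "cpu-periodic-throttle", true);
    migrate_set_parameter_int(from, "cpu-periodic-throttle-interval", 5);
    /* The interval must lie in the validated range of 2 to 10 seconds. */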
The migration statistic dirty-sync-count could no longer reflect the iteration count due to the introduction of periodic synchronization in the next commit; add a separate iteration count to compensate. Signed-off-by: Hyman Huang <yong.huang@smartx.com> --- migration/migration-stats.h | 4 ++++ migration/migration.c | 1 + migration/ram.c | 12 ++++++++---- qapi/migration.json | 6 +++++- tests/qtest/migration-test.c | 2 +- 5 files changed, 19 insertions(+), 6 deletions(-) diff --git a/migration/migration-stats.h b/migration/migration-stats.h index XXXXXXX..XXXXXXX 100644 --- a/migration/migration-stats.h +++ b/migration/migration-stats.h @@ -XXX,XX +XXX,XX @@ typedef struct { * Number of times we have synchronized guest bitmaps. */ Stat64 dirty_sync_count; + /* + * Number of migration iterations processed. + */ + Stat64 iteration_count; /* * Number of times zero copy failed to send any page using zero * copy. diff --git a/migration/migration.c b/migration/migration.c index XXXXXXX..XXXXXXX 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -XXX,XX +XXX,XX @@ static void populate_ram_info(MigrationInfo *info, MigrationState *s) info->ram->mbps = s->mbps; info->ram->dirty_sync_count = stat64_get(&mig_stats.dirty_sync_count); + info->ram->iteration_count = stat64_get(&mig_stats.iteration_count); info->ram->dirty_sync_missed_zero_copy = stat64_get(&mig_stats.dirty_sync_missed_zero_copy); info->ram->postcopy_requests = diff --git a/migration/ram.c b/migration/ram.c index XXXXXXX..XXXXXXX 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -XXX,XX +XXX,XX @@ static void xbzrle_cache_zero_page(ram_addr_t current_addr) /* We don't care if this fails to allocate a new cache page * as long as it updated an old one */ cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page, - stat64_get(&mig_stats.dirty_sync_count)); + stat64_get(&mig_stats.iteration_count)); } #define ENCODING_FLAG_XBZRLE 0x1 @@ -XXX,XX +XXX,XX @@ static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss, int encoded_len = 0, bytes_xbzrle; uint8_t *prev_cached_page; QEMUFile *file = pss->pss_channel; - uint64_t generation = stat64_get(&mig_stats.dirty_sync_count); + uint64_t generation = stat64_get(&mig_stats.iteration_count); if (!cache_is_cached(XBZRLE.cache, current_addr, generation)) { xbzrle_counters.cache_miss++; @@ -XXX,XX +XXX,XX @@ static void migration_bitmap_sync(RAMState *rs, RAMBlock *block; int64_t end_time; + if (!periodic) { + stat64_add(&mig_stats.iteration_count, 1); + } + stat64_add(&mig_stats.dirty_sync_count, 1); if (!rs->time_last_bitmap_sync) { @@ -XXX,XX +XXX,XX @@ static void migration_bitmap_sync(RAMState *rs, rs->num_dirty_pages_period = 0; rs->bytes_xfer_prev = migration_transferred_bytes(); } - if (migrate_events()) { - uint64_t generation = stat64_get(&mig_stats.dirty_sync_count); + if (!periodic && migrate_events()) { + uint64_t generation = stat64_get(&mig_stats.iteration_count); qapi_event_send_migration_pass(generation); } } diff --git a/qapi/migration.json b/qapi/migration.json index XXXXXXX..XXXXXXX 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -XXX,XX +XXX,XX @@ # between 0 and @dirty-sync-count * @multifd-channels. (since # 7.1) # +# @iteration-count: The number of iterations since migration started. 
+# (since 9.2) +# # Since: 0.14 ## { 'struct': 'MigrationStats', @@ -XXX,XX +XXX,XX @@ 'multifd-bytes': 'uint64', 'pages-per-second': 'uint64', 'precopy-bytes': 'uint64', 'downtime-bytes': 'uint64', 'postcopy-bytes': 'uint64', - 'dirty-sync-missed-zero-copy': 'uint64' } } + 'dirty-sync-missed-zero-copy': 'uint64', + 'iteration-count' : 'int' } } ## # @XBZRLECacheStats: diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index XXXXXXX..XXXXXXX 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -XXX,XX +XXX,XX @@ static int64_t read_migrate_property_int(QTestState *who, const char *property) static uint64_t get_migration_pass(QTestState *who) { - return read_ram_property_int(who, "dirty-sync-count"); + return read_ram_property_int(who, "iteration-count"); } static void read_blocktime(QTestState *who) -- 2.39.1
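[Editor's note] A reading aid (not part of the patch) on how the two counters relate once periodic sync is wired up in the following commits:

    /* Every bitmap sync bumps dirty_sync_count, while only per-iteration
     * (non-periodic) syncs bump iteration_count, so
     * dirty_sync_count >= iteration_count always holds, with strict
     * inequality once at least one periodic sync has run. A later qtest
     * patch relies on exactly this to verify that periodic sync fired. */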
Provide useful utilities to manage the periodic_throttle_thread's lifecycle. Additionally, to set up the sync mode, provide periodic_throttle_setup. Signed-off-by: Hyman Huang <yong.huang@smartx.com> --- migration/ram.c | 98 +++++++++++++++++++++++++++++++++++++++++- migration/ram.h | 4 ++ migration/trace-events | 3 ++ 3 files changed, 104 insertions(+), 1 deletion(-) diff --git a/migration/ram.c b/migration/ram.c index XXXXXXX..XXXXXXX 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -XXX,XX +XXX,XX @@ struct RAMState { * RAM migration. */ unsigned int postcopy_bmap_sync_requested; + + /* Periodic throttle information */ + bool throttle_running; + QemuThread throttle_thread; }; typedef struct RAMState RAMState; @@ -XXX,XX +XXX,XX @@ static void migration_bitmap_sync(RAMState *rs, RAMBlock *block; int64_t end_time; - if (!periodic) { + if (periodic) { + /* Be careful that we don't synchronize too often */ + int64_t curr_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + if (curr_time < rs->time_last_bitmap_sync + 1000) { + return; + } + } else { stat64_add(&mig_stats.iteration_count, 1); } @@ -XXX,XX +XXX,XX @@ static void migration_bitmap_sync(RAMState *rs, } } +static void *periodic_throttle_thread(void *opaque) +{ + RAMState *rs = opaque; + bool skip_sleep = false; + int sleep_duration = migrate_periodic_throttle_interval(); + + rcu_register_thread(); + + while (qatomic_read(&rs->throttle_running)) { + int64_t curr_time; + /* + * The first iteration copies all memory anyhow and has no + * effect on guest performance, therefore omit it to avoid + * paying extra for the sync penalty; sleep instead of + * spinning while we wait for it to finish. + */ + if (stat64_get(&mig_stats.iteration_count) <= 1) { + sleep(sleep_duration); + continue; + } + + if (!skip_sleep) { + sleep(sleep_duration); + } + + /* Be careful that we don't synchronize too often */ + curr_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + if (curr_time > rs->time_last_bitmap_sync + 1000) { + bql_lock(); + trace_migration_periodic_throttle(); + WITH_RCU_READ_LOCK_GUARD() { + migration_bitmap_sync(rs, false, true); + } + bql_unlock(); + skip_sleep = false; + } else { + skip_sleep = true; + } + } + + rcu_unregister_thread(); + + return NULL; +} + +void periodic_throttle_start(void) +{ + RAMState *rs = ram_state; + + if (!rs) { + return; + } + + if (qatomic_read(&rs->throttle_running)) { + return; + } + + trace_migration_periodic_throttle_start(); + + qatomic_set(&rs->throttle_running, 1); + qemu_thread_create(&rs->throttle_thread, + NULL, periodic_throttle_thread, + rs, QEMU_THREAD_JOINABLE); +} + +void periodic_throttle_stop(void) +{ + RAMState *rs = ram_state; + + if (!rs) { + return; + } + + if (!qatomic_read(&rs->throttle_running)) { + return; + } + + trace_migration_periodic_throttle_stop(); + + qatomic_set(&rs->throttle_running, 0); + qemu_thread_join(&rs->throttle_thread); +} + +void periodic_throttle_setup(bool enable) +{ + sync_mode = enable ? 
RAMBLOCK_SYN_MODERN : RAMBLOCK_SYN_LEGACY; +} + static void migration_bitmap_sync_precopy(RAMState *rs, bool last_stage) { Error *local_err = NULL; diff --git a/migration/ram.h b/migration/ram.h index XXXXXXX..XXXXXXX 100644 --- a/migration/ram.h +++ b/migration/ram.h @@ -XXX,XX +XXX,XX @@ void ram_write_tracking_prepare(void); int ram_write_tracking_start(void); void ram_write_tracking_stop(void); +/* Periodic throttle */ +void periodic_throttle_start(void); +void periodic_throttle_stop(void); +void periodic_throttle_setup(bool enable); #endif diff --git a/migration/trace-events b/migration/trace-events index XXXXXXX..XXXXXXX 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -XXX,XX +XXX,XX @@ get_queued_page_not_dirty(const char *block_name, uint64_t tmp_offset, unsigned migration_bitmap_sync_start(void) "" migration_bitmap_sync_end(uint64_t dirty_pages) "dirty_pages %" PRIu64 migration_bitmap_clear_dirty(char *str, uint64_t start, uint64_t size, unsigned long page) "rb %s start 0x%"PRIx64" size 0x%"PRIx64" page 0x%lx" +migration_periodic_throttle(void) "" +migration_periodic_throttle_start(void) "" +migration_periodic_throttle_stop(void) "" migration_throttle(void) "" migration_dirty_limit_guest(int64_t dirtyrate) "guest dirty page rate limit %" PRIi64 " MB/s" ram_discard_range(const char *rbname, uint64_t start, size_t len) "%s: start: %" PRIx64 " %zx" -- 2.39.1
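[Editor's note] The intended call sequence for these helpers, condensed from how the next patch wires them into migration.c (not new code):

    periodic_throttle_setup(true);   /* switch sync_mode to RAMBLOCK_SYN_MODERN */
    periodic_throttle_start();       /* spawn periodic_throttle_thread */
    /* ... iterations run; the thread syncs the dirty bitmap every
     * cpu-periodic-throttle-interval seconds, skipping a sync if less
     * than one second has passed since the last one ... */
    periodic_throttle_stop();        /* clear the flag and join the thread */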
When a VM is configured with huge memory, the current throttle logic doesn't scale, because migration_trigger_throttle() is only called once per iteration, so it won't be invoked for a long time if one iteration takes a long time. The periodic sync and throttle aim to fix the above issue by synchronizing the remote dirty bitmap and triggering the throttle periodically. This is a trade-off between synchronization overhead and CPU throttle impact. Signed-off-by: Hyman Huang <yong.huang@smartx.com> --- migration/migration.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/migration/migration.c b/migration/migration.c index XXXXXXX..XXXXXXX 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -XXX,XX +XXX,XX @@ static void migrate_fd_cleanup(MigrationState *s) qemu_thread_join(&s->thread); s->migration_thread_running = false; } + if (migrate_periodic_throttle()) { + periodic_throttle_stop(); + } bql_lock(); multifd_send_shutdown(); @@ -XXX,XX +XXX,XX @@ static MigIterateState migration_iteration_run(MigrationState *s) if ((!pending_size || pending_size < s->threshold_size) && can_switchover) { trace_migration_thread_low_pending(pending_size); + if (migrate_periodic_throttle()) { + periodic_throttle_stop(); + } migration_completion(s); return MIG_ITERATE_BREAK; } @@ -XXX,XX +XXX,XX @@ static void *migration_thread(void *opaque) ret = qemu_savevm_state_setup(s->to_dst_file, &local_err); bql_unlock(); + if (migrate_periodic_throttle()) { + periodic_throttle_setup(true); + periodic_throttle_start(); + } + qemu_savevm_wait_unplug(s, MIGRATION_STATUS_SETUP, MIGRATION_STATUS_ACTIVE); -- 2.39.1
To make sure the periodic throttle feature doesn't regress any existing features or functionality, enable it in the auto-converge migration test. Signed-off-by: Hyman Huang <yong.huang@smartx.com> --- tests/qtest/migration-test.c | 56 +++++++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index XXXXXXX..XXXXXXX 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -XXX,XX +XXX,XX @@ static uint64_t get_migration_pass(QTestState *who) return read_ram_property_int(who, "iteration-count"); } +static uint64_t get_migration_dirty_sync_count(QTestState *who) +{ + return read_ram_property_int(who, "dirty-sync-count"); +} + static void read_blocktime(QTestState *who) { QDict *rsp_return; @@ -XXX,XX +XXX,XX @@ typedef struct { PostcopyRecoveryFailStage postcopy_recovery_fail_stage; } MigrateCommon; +typedef struct { + /* CPU throttle parameters */ + bool periodic; +} AutoConvergeArgs; + static int test_migrate_start(QTestState **from, QTestState **to, const char *uri, MigrateStart *args) { @@ -XXX,XX +XXX,XX @@ static void test_validate_uri_channels_none_set(void) * To make things even worse, we need to run the initial stage at * 3MB/s so we enter autoconverge even when host is (over)loaded. */ -static void test_migrate_auto_converge(void) +static void test_migrate_auto_converge_args(AutoConvergeArgs *input_args) { g_autofree char *uri = g_strdup_printf("unix:%s/migsocket", tmpfs); MigrateStart args = {}; QTestState *from, *to; int64_t percentage; + bool periodic = (input_args && input_args->periodic); /* * We want the test to be stable and as fast as possible. @@ -XXX,XX +XXX,XX @@ static void test_migrate_auto_converge(void) * so we need to decrease a bandwidth. */ const int64_t init_pct = 5, inc_pct = 25, max_pct = 95; + const int64_t periodic_throttle_interval = 2; if (test_migrate_start(&from, &to, uri, &args)) { return; } @@ -XXX,XX +XXX,XX @@ static void test_migrate_auto_converge(void) migrate_set_parameter_int(from, "cpu-throttle-increment", inc_pct); migrate_set_parameter_int(from, "max-cpu-throttle", max_pct); + if (periodic) { + migrate_set_parameter_bool(from, "cpu-periodic-throttle", true); + migrate_set_parameter_int(from, "cpu-periodic-throttle-interval", + periodic_throttle_interval); + } + /* * Set the initial parameters so that the migration could not converge * without throttling. @@ -XXX,XX +XXX,XX @@ static void test_migrate_auto_converge(void) } while (true); /* The first percentage of throttling should be at least init_pct */ g_assert_cmpint(percentage, >=, init_pct); + + if (periodic) { + /* + * Check whether periodic sync takes effect; set the timeout to 20s + * (max_try_count * 1s) and fail the test if no extra sync shows up. 
+ */ + uint64_t iteration_count, dirty_sync_count; + bool extra_sync = false; + int max_try_count = 20; + + /* Check whether periodic sync takes effect */ + while (--max_try_count) { + usleep(1000 * 1000); + iteration_count = get_migration_pass(from); + dirty_sync_count = get_migration_dirty_sync_count(from); + if (dirty_sync_count > iteration_count) { + extra_sync = true; + break; + } + } + g_assert(extra_sync); + } + /* Now, when we tested that throttling works, let it converge */ migrate_ensure_converge(from); @@ -XXX,XX +XXX,XX @@ static void test_migrate_auto_converge(void) test_migrate_end(from, to, true); } +static void test_migrate_auto_converge(void) +{ + test_migrate_auto_converge_args(NULL); +} + +static void test_migrate_auto_converge_periodic_throttle(void) +{ + AutoConvergeArgs args = {.periodic = true}; + test_migrate_auto_converge_args(&args); +} + static void * test_migrate_precopy_tcp_multifd_start_common(QTestState *from, QTestState *to, @@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv) if (g_test_slow()) { migration_test_add("/migration/auto_converge", test_migrate_auto_converge); + migration_test_add("/migration/auto_converge_periodic_throttle", + test_migrate_auto_converge_periodic_throttle); if (g_str_equal(arch, "x86_64") && has_kvm && kvm_dirty_ring_supported()) { migration_test_add("/migration/dirty_limit", -- 2.39.1
To enable the responsive throttle that will be implemented in the next commit, introduce the cpu-responsive-throttle parameter. Signed-off-by: Hyman Huang <yong.huang@smartx.com> --- migration/migration-hmp-cmds.c | 8 ++++++++ migration/options.c | 20 ++++++++++++++++++++ migration/options.h | 1 + qapi/migration.json | 16 +++++++++++++++- 4 files changed, 44 insertions(+), 1 deletion(-) diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c index XXXXXXX..XXXXXXX 100644 --- a/migration/migration-hmp-cmds.c +++ b/migration/migration-hmp-cmds.c @@ -XXX,XX +XXX,XX @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict) MigrationParameter_str( MIGRATION_PARAMETER_CPU_PERIODIC_THROTTLE_INTERVAL), params->cpu_periodic_throttle_interval); + assert(params->has_cpu_responsive_throttle); + monitor_printf(mon, "%s: %s\n", + MigrationParameter_str(MIGRATION_PARAMETER_CPU_RESPONSIVE_THROTTLE), + params->cpu_responsive_throttle ? "on" : "off"); assert(params->has_max_cpu_throttle); monitor_printf(mon, "%s: %u\n", MigrationParameter_str(MIGRATION_PARAMETER_MAX_CPU_THROTTLE), @@ -XXX,XX +XXX,XX @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) p->has_cpu_periodic_throttle_interval = true; visit_type_uint8(v, param, &p->cpu_periodic_throttle_interval, &err); break; + case MIGRATION_PARAMETER_CPU_RESPONSIVE_THROTTLE: + p->has_cpu_responsive_throttle = true; + visit_type_bool(v, param, &p->cpu_responsive_throttle, &err); + break; case MIGRATION_PARAMETER_MAX_CPU_THROTTLE: p->has_max_cpu_throttle = true; visit_type_uint8(v, param, &p->max_cpu_throttle, &err); diff --git a/migration/options.c b/migration/options.c index XXXXXXX..XXXXXXX 100644 --- a/migration/options.c +++ b/migration/options.c @@ -XXX,XX +XXX,XX @@ Property migration_properties[] = { DEFINE_PROP_UINT8("x-cpu-periodic-throttle-interval", MigrationState, parameters.cpu_periodic_throttle_interval, DEFAULT_MIGRATE_CPU_PERIODIC_THROTTLE_INTERVAL), + DEFINE_PROP_BOOL("x-cpu-responsive-throttle", MigrationState, + parameters.cpu_responsive_throttle, false), DEFINE_PROP_SIZE("x-max-bandwidth", MigrationState, parameters.max_bandwidth, MAX_THROTTLE), DEFINE_PROP_SIZE("avail-switchover-bandwidth", MigrationState, @@ -XXX,XX +XXX,XX @@ bool migrate_periodic_throttle(void) return s->parameters.cpu_periodic_throttle; } +bool migrate_responsive_throttle(void) +{ + MigrationState *s = migrate_get_current(); + + return s->parameters.cpu_responsive_throttle; +} + bool migrate_cpu_throttle_tailslow(void) { MigrationState *s = migrate_get_current(); @@ -XXX,XX +XXX,XX @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp) params->has_cpu_periodic_throttle_interval = true; params->cpu_periodic_throttle_interval = s->parameters.cpu_periodic_throttle_interval; + params->has_cpu_responsive_throttle = true; + params->cpu_responsive_throttle = s->parameters.cpu_responsive_throttle; params->tls_creds = g_strdup(s->parameters.tls_creds); params->tls_hostname = g_strdup(s->parameters.tls_hostname); params->tls_authz = g_strdup(s->parameters.tls_authz ? 
@@ -XXX,XX +XXX,XX @@ void migrate_params_init(MigrationParameters *params) params->has_cpu_throttle_tailslow = true; params->has_cpu_periodic_throttle = true; params->has_cpu_periodic_throttle_interval = true; + params->has_cpu_responsive_throttle = true; params->has_max_bandwidth = true; params->has_downtime_limit = true; params->has_x_checkpoint_delay = true; @@ -XXX,XX +XXX,XX @@ static void migrate_params_test_apply(MigrateSetParameters *params, params->cpu_periodic_throttle_interval; } + if (params->has_cpu_responsive_throttle) { + dest->cpu_responsive_throttle = params->cpu_responsive_throttle; + } + if (params->tls_creds) { assert(params->tls_creds->type == QTYPE_QSTRING); dest->tls_creds = params->tls_creds->u.s; @@ -XXX,XX +XXX,XX @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp) params->cpu_periodic_throttle_interval; } + if (params->has_cpu_responsive_throttle) { + s->parameters.cpu_responsive_throttle = params->cpu_responsive_throttle; + } + if (params->tls_creds) { g_free(s->parameters.tls_creds); assert(params->tls_creds->type == QTYPE_QSTRING); diff --git a/migration/options.h b/migration/options.h index XXXXXXX..XXXXXXX 100644 --- a/migration/options.h +++ b/migration/options.h @@ -XXX,XX +XXX,XX @@ uint8_t migrate_cpu_throttle_increment(void); uint8_t migrate_cpu_throttle_initial(void); uint8_t migrate_periodic_throttle_interval(void); bool migrate_periodic_throttle(void); +bool migrate_responsive_throttle(void); bool migrate_cpu_throttle_tailslow(void); bool migrate_direct_io(void); uint64_t migrate_downtime_limit(void); diff --git a/qapi/migration.json b/qapi/migration.json index XXXXXXX..XXXXXXX 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -XXX,XX +XXX,XX @@ # @cpu-periodic-throttle-interval: Interval, in seconds, of the periodic # CPU throttling. (Since 9.1) # +# @cpu-responsive-throttle: Make CPU throttling more responsive by +# introducing an extra metric to detect +# migration convergence. (Since 9.1) +# # @tls-creds: ID of the 'tls-creds' object that provides credentials # for establishing a TLS connection over the migration data # channel. On the outgoing side of the migration, the credentials @@ -XXX,XX +XXX,XX @@ 'throttle-trigger-threshold', 'cpu-throttle-initial', 'cpu-throttle-increment', 'cpu-throttle-tailslow', 'cpu-periodic-throttle', - 'cpu-periodic-throttle-interval', + 'cpu-periodic-throttle-interval', 'cpu-responsive-throttle', 'tls-creds', 'tls-hostname', 'tls-authz', 'max-bandwidth', 'avail-switchover-bandwidth', 'downtime-limit', { 'name': 'x-checkpoint-delay', 'features': [ 'unstable' ] }, @@ -XXX,XX +XXX,XX @@ # @cpu-periodic-throttle-interval: Interval, in seconds, of the periodic # CPU throttling. (Since 9.1) # +# @cpu-responsive-throttle: Make CPU throttling more responsive by +# introducing an extra metric to detect +# migration convergence. (Since 9.1) +# # @tls-creds: ID of the 'tls-creds' object that provides credentials # for establishing a TLS connection over the migration data # channel. On the outgoing side of the migration, the credentials @@ -XXX,XX +XXX,XX @@ '*cpu-throttle-tailslow': 'bool', '*cpu-periodic-throttle': 'bool', '*cpu-periodic-throttle-interval': 'uint8', + '*cpu-responsive-throttle': 'bool', '*tls-creds': 'StrOrNull', '*tls-hostname': 'StrOrNull', '*tls-authz': 'StrOrNull', @@ -XXX,XX +XXX,XX @@ # @cpu-periodic-throttle-interval: Interval, in seconds, of the periodic # CPU throttling. 
# (Since 9.1) # +# @cpu-responsive-throttle: Make CPU throttling more responsive by +# introducing an extra metric to detect +# migration convergence. (Since 9.1) +# # @tls-creds: ID of the 'tls-creds' object that provides credentials # for establishing a TLS connection over the migration data # channel. On the outgoing side of the migration, the credentials @@ -XXX,XX +XXX,XX @@ '*cpu-throttle-tailslow': 'bool', '*cpu-periodic-throttle': 'bool', '*cpu-periodic-throttle-interval': 'uint8', + '*cpu-responsive-throttle': 'bool', '*tls-creds': 'str', '*tls-hostname': 'str', '*tls-authz': 'str', -- 2.39.1
Currently, the convergence algorithm determines that the migration cannot converge according to the following principle: the dirty pages generated in the current iteration exceed a specific percentage (throttle-trigger-threshold, 50 by default) of the number of transmissions. Let's refer to this criterion as the "dirty rate". If this criterion is met at least twice (dirty_rate_high_cnt >= 2), the throttle percentage is increased. In most cases, the above implementation is appropriate. However, for a VM under high memory load, each iteration is time-consuming. The VM's computing performance may be throttled at a high percentage for a long time due to the repeated confirmation behavior, which may be intolerable for some computationally sensitive software in the VM. As the comment in the migration_trigger_throttle function mentions, the original algorithm confirms the criterion repeatedly in order to avoid erroneous detection. Put differently, the criterion would not need to be validated again if the detection were more reliable. In this refinement, in order to make the detection more accurate, we introduce another criterion, called the "dirty ratio", to determine migration convergence. The "dirty ratio" is the ratio of bytes_dirty_period to bytes_xfer_period. When the algorithm repeatedly detects that the "dirty ratio" of the current sync is no lower than that of the previous one, it determines that the migration cannot converge. For the "dirty rate" and the "dirty ratio", if either of the two criteria is met, the penalty percentage is increased. This makes the CPU throttle more responsive, which shortens the entire iteration and therefore reduces the duration of VM performance degradation. In conclusion, this refinement significantly reduces the time required for the throttle percentage to step to its maximum while the VM is under high memory load. Signed-off-by: Hyman Huang <yong.huang@smartx.com> --- migration/ram.c | 55 ++++++++++++++++++++++++++++++++++-- migration/trace-events | 1 + tests/qtest/migration-test.c | 1 + 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index XXXXXXX..XXXXXXX 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -XXX,XX +XXX,XX @@ struct RAMState { /* Periodic throttle information */ bool throttle_running; QemuThread throttle_thread; + + /* + * Ratio of bytes_dirty_period to bytes_xfer_period in the previous + * sync. + */ + uint64_t dirty_ratio_pct; }; typedef struct RAMState RAMState; @@ -XXX,XX +XXX,XX @@ static void migration_dirty_limit_guest(void) trace_migration_dirty_limit_guest(quota_dirtyrate); } +static bool migration_dirty_ratio_high(RAMState *rs) +{ + static int dirty_ratio_high_cnt; + uint64_t threshold = migrate_throttle_trigger_threshold(); + uint64_t bytes_xfer_period = + migration_transferred_bytes() - rs->bytes_xfer_prev; + uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE; + bool dirty_ratio_high = false; + uint64_t prev, curr; + + /* Calculate the dirty ratio percentage */ + curr = 100 * (bytes_dirty_period * 1.0 / bytes_xfer_period); + + prev = rs->dirty_ratio_pct; + rs->dirty_ratio_pct = curr; + + if (prev == 0) { + return false; + } + + /* + * If the current dirty ratio is no lower than the previous one, + * determine that the migration does not converge. 
+ */ + if (curr > threshold && curr >= prev) { + trace_migration_dirty_ratio_high(curr, prev); + dirty_ratio_high_cnt++; + } + + if (dirty_ratio_high_cnt >= 2) { + dirty_ratio_high = true; + dirty_ratio_high_cnt = 0; + } + + return dirty_ratio_high; +} + static void migration_trigger_throttle(RAMState *rs) { uint64_t threshold = migrate_throttle_trigger_threshold(); @@ -XXX,XX +XXX,XX @@ static void migration_trigger_throttle(RAMState *rs) migration_transferred_bytes() - rs->bytes_xfer_prev; uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE; uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100; + bool dirty_ratio_high = false; + + if (migrate_responsive_throttle() && (bytes_xfer_period != 0)) { + dirty_ratio_high = migration_dirty_ratio_high(rs); + } /* * The following detection logic can be refined later. For now: @@ -XXX,XX +XXX,XX @@ static void migration_trigger_throttle(RAMState *rs) * twice, start or increase throttling. */ if ((bytes_dirty_period > bytes_dirty_threshold) && - (++rs->dirty_rate_high_cnt >= 2)) { - rs->dirty_rate_high_cnt = 0; + ((++rs->dirty_rate_high_cnt >= 2) || dirty_ratio_high)) { + + rs->dirty_rate_high_cnt = + rs->dirty_rate_high_cnt >= 2 ? 0 : rs->dirty_rate_high_cnt; + if (migrate_auto_converge()) { trace_migration_throttle(); mig_throttle_guest_down(bytes_dirty_period, diff --git a/migration/trace-events b/migration/trace-events index XXXXXXX..XXXXXXX 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -XXX,XX +XXX,XX @@ get_queued_page_not_dirty(const char *block_name, uint64_t tmp_offset, unsigned migration_bitmap_sync_start(void) "" migration_bitmap_sync_end(uint64_t dirty_pages) "dirty_pages %" PRIu64 migration_bitmap_clear_dirty(char *str, uint64_t start, uint64_t size, unsigned long page) "rb %s start 0x%"PRIx64" size 0x%"PRIx64" page 0x%lx" +migration_dirty_ratio_high(uint64_t cur, uint64_t prev) "current ratio: %" PRIu64 " previous ratio: %" PRIu64 migration_periodic_throttle(void) "" migration_periodic_throttle_start(void) "" migration_periodic_throttle_stop(void) "" diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index XXXXXXX..XXXXXXX 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -XXX,XX +XXX,XX @@ static void test_migrate_auto_converge_args(AutoConvergeArgs *input_args) migrate_set_parameter_int(from, "cpu-throttle-initial", init_pct); migrate_set_parameter_int(from, "cpu-throttle-increment", inc_pct); migrate_set_parameter_int(from, "max-cpu-throttle", max_pct); + migrate_set_parameter_bool(from, "cpu-responsive-throttle", true); if (periodic) { migrate_set_parameter_bool(from, "cpu-periodic-throttle", true); -- 2.39.1
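[Editor's note] A worked example of the dirty-ratio criterion above, with hypothetical numbers and the default threshold of 50: suppose sync N transfers bytes_xfer_period = 1000 MB while bytes_dirty_period = 600 MB, so curr = 100 * 600 / 1000 = 60; prev is 0, so this sync only records prev = 60 and returns false. In sync N+1, 650 MB are dirtied against 1000 MB transferred, so curr = 65 > 50 and 65 >= 60, and dirty_ratio_high_cnt becomes 1. In sync N+2, curr = 70 >= 65, the counter reaches 2, and dirty_ratio_high is reported; provided the dirty-rate condition bytes_dirty_period > bytes_dirty_threshold also holds, the throttle is then increased without waiting for dirty_rate_high_cnt to reach 2 on its own.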
Even though the responsive CPU throttle is enabled, the dirty sync count may not always increase, because this is an optimization that might not take effect in every situation. This test case just makes sure the feature doesn't interfere with any existing functionality. Signed-off-by: Hyman Huang <yong.huang@smartx.com> --- tests/qtest/migration-test.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index XXXXXXX..XXXXXXX 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -XXX,XX +XXX,XX @@ typedef struct { typedef struct { /* CPU throttle parameters */ bool periodic; + bool responsive; } AutoConvergeArgs; static int test_migrate_start(QTestState **from, QTestState **to, @@ -XXX,XX +XXX,XX @@ static void test_migrate_auto_converge_args(AutoConvergeArgs *input_args) QTestState *from, *to; int64_t percentage; bool periodic = (input_args && input_args->periodic); + bool responsive = (input_args && input_args->responsive); /* * We want the test to be stable and as fast as possible. @@ -XXX,XX +XXX,XX @@ static void test_migrate_auto_converge_args(AutoConvergeArgs *input_args) periodic_throttle_interval); } + if (responsive) { + /* + * The dirty-sync-count may not always go down while using responsive + * throttle because it is an optimization and may not take effect in + * every scenario. Just make sure this feature doesn't break any + * existing functionality by turning it on. + */ + migrate_set_parameter_bool(from, "cpu-responsive-throttle", true); + } + /* * Set the initial parameters so that the migration could not converge * without throttling. @@ -XXX,XX +XXX,XX @@ static void test_migrate_auto_converge_periodic_throttle(void) test_migrate_auto_converge_args(&args); } +static void test_migrate_auto_converge_responsive_throttle(void) +{ + AutoConvergeArgs args = {.responsive = true}; + test_migrate_auto_converge_args(&args); +} + static void * test_migrate_precopy_tcp_multifd_start_common(QTestState *from, QTestState *to, @@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv) test_migrate_auto_converge); migration_test_add("/migration/auto_converge_periodic_throttle", test_migrate_auto_converge_periodic_throttle); + migration_test_add("/migration/auto_converge_responsive_throttle", + test_migrate_auto_converge_responsive_throttle); if (g_str_equal(arch, "x86_64") && has_kvm && kvm_dirty_ring_supported()) { migration_test_add("/migration/dirty_limit", -- 2.39.1
From: Hyman Huang <yong.huang@smartx.com>

v2:
1. background sync - Throw out the idea of "not updating the bitmap" when the RAMBlocks of the RAM list are iterated during migration; re-implement the background RAM dirty sync using Peter's updated method.
2. responsive throttle - Rename "cpu-responsive-throttle" to "cpu-throttle-responsive," as suggested by Fabiano.

For the background sync refinement, the new method achieves a much shorter migration time overall and is much simpler than the previous approach. The test approach is the same as in version 1, but we use a new test host with 1.93 Gbps bandwidth. Please refer to the information included in version 1 below for more details.

The new test results for background dirty sync are as follows; we didn't test the responsive throttle feature because its logic is still the same. For each scenario, we tested twice and used 3500MB as the high memory load.

ramsize: 150MB
|------------+-----------+----------+-----------+--------------|
|            | totaltime | downtime | iteration | max throttle |
|            | (ms)      | (ms)     | count     | percent      |
|------------+-----------+----------+-----------+--------------|
| original   | 57579     | 577      | 107       | 90%          |
|            | 57434     | 466      | 110       | 90%          |
|------------+-----------+----------+-----------+--------------|
| background | 57728     | 488      | 102       | 90%          |
| sync       | 60223     | 447      | 115       | 90%          |
|------------+-----------+----------+-----------+--------------|

ramsize: 3000MB
|------------+-----------+----------+-----------+--------------|
|            | totaltime | downtime | iteration | max throttle |
|            | (ms)      | (ms)     | count     | percent      |
|------------+-----------+----------+-----------+--------------|
| original   | 291275    | 416      | 26        | 99%          |
|            | 290907    | 567      | 27        | 99%          |
|------------+-----------+----------+-----------+--------------|
| background | 210912    | 465      | 30        | 99%          |
| sync       | 203890    | 442      | 28        | 99%          |
|------------+-----------+----------+-----------+--------------|

ramsize: 3500MB
|------------+-----------+----------+-----------+--------------|
|            | totaltime | downtime | iteration | max throttle |
|            | (ms)      | (ms)     | count     | percent      |
|------------+-----------+----------+-----------+--------------|
| original   | 338445    | 491      | 28        | 99%          |
|            | 341168    | 496      | 28        | 99%          |
|------------+-----------+----------+-----------+--------------|
| background | 209099    | 440      | 27        | 99%          |
| sync       | 206241    | 467      | 30        | 99%          |
|------------+-----------+----------+-----------+--------------|

v1: This is the first version of the auto-converge refinements; refer to the following link for details about the RFC version: https://patchew.org/QEMU/cover.1725891841.git.yong.huang@smartx.com/

This series introduces two refinements called "background sync" and "responsive throttle," respectively.

1. background sync: The original auto-converge throttle logic doesn't scale because migration_trigger_throttle() is only called once per iteration, so it won't be invoked for a long time if one iteration takes a long time. The background sync fixes this issue by implementing a background dirty bitmap sync and throttling automatically once it detects that an iteration is lasting a long time during migration. The background sync is implemented transparently; there is no new interface for upper apps.

2. responsive throttle: The original auto-converge throttle logic determines whether the migration converges using a single criterion, and if that criterion is met twice, launches the CPU throttle or increases the throttle percentage.
This results in that the migration_trigger_throttle() won't be invoked for a long time if one iteration can take a long time too. The responsive throttle introduce one more criteria to assist detecting the convergence of the migration, if either of the two criteria is met, migration_trigger_throttle() would be called. This also makes it more likely that the CPU throttle will be activated, thereby accelerating the migration process. The responsive throttle provides the 'cpu-responsive-throttle' option to enable this feature. We test this two features with the following environment: a. Test tool: guestperf Refer to the following link to see details: https://github.com/qemu/qemu/tree/master/tests/migration/guestperf b. Test VM scale: CPU: 16; Memory: 100GB c. Average bandwidth between source and destination for migration: 1.59 Gbits/sec We use stress tool contained in the initrd-stress.img to update ramsize MB on every CPU in guest, refer to the following link to see the source code: https://github.com/qemu/qemu/blob/master/tests/migration/stress.c The following command is executed to compare our refined QEMU with the original QEMU: # python3.6 guestperf.py --binary /path/to/qemu-kvm --cpus 16 \ --mem 100 --max-iters 200 --max-time 1200 --dst-host {dst_ip} \ --kernel /path/to/vmlinuz --initrd /path/to/initrd-stress.img \ --transport tcp --downtime 500 --auto-converge --auto-converge-step 10 \ --verbose --stress-mem {ramsize} We set ramsize to 150MB to simulate the light load, 3000MB as moderate load and 5000MB as heavy load. Test cases were executed three times in each scenario. The following data shows the migration test results with an increase in stress. ramsize: 150MB |------------+-----------+----------+-----------+--------------| | | totaltime | downtime | iteration | max throttle | | | (ms) | (ms) | count | percent | |------------+-----------+----------+-----------+--------------| | original | 123685 | 490 | 87 | 99% | | | 116249 | 542 | 45 | 60% | | | 107772 | 587 | 8 | 0% | |------------+-----------+----------+-----------+--------------| | background | 113744 | 1654 | 16 | 20% | | sync | 122623 | 758 | 60 | 80% | | | 112668 | 547 | 23 | 20% | |------------+-----------+----------+-----------+--------------| | background | 113660 | 573 | 5 | 0% | | sync + | 109357 | 576 | 6 | 0% | | responsive | 126792 | 494 | 37 | 99% | | throttle | | | | | |------------+-----------+----------+-----------+--------------| ramsize: 3000MB |------------+-----------+----------+-----------+--------------| | | totaltime | downtime | iteration | max throttle | | | (ms) | (ms) | count | percent | |------------+-----------+----------+-----------+--------------| | original | 404398 | 515 | 26 | 99% | | | 392552 | 528 | 25 | 99% | | | 400113 | 447 | 24 | 99% | |------------+-----------+----------+-----------+--------------| | background | 239151 | 681 | 25 | 99% | | sync | 295047 | 587 | 41 | 99% | | | 289936 | 681 | 34 | 99% | |------------+-----------+----------+-----------+--------------| | background | 212786 | 487 | 22 | 99% | | sync + | 225246 | 666 | 23 | 99% | | responsive | 244053 | 572 | 27 | 99% | | throttle | | | | | |------------+-----------+----------+-----------+--------------| ramsize: 5000MB |------------+-----------+----------+-----------+--------------| | | totaltime | downtime | iteration | max throttle | | | (ms) | (ms) | count | percent | |------------+-----------+----------+-----------+--------------| | original | 566357 | 644 | 22 | 99% | | | 607471 | 320 | 23 | 99% | | | 603136 | 417 | 
|------------+-----------+----------+-----------+--------------|
| background | 284605    | 793      | 27        | 99%          |
| sync       | 272270    | 668      | 28        | 99%          |
|            | 267543    | 545      | 28        | 99%          |
|------------+-----------+----------+-----------+--------------|
| background | 226446    | 413      | 22        | 99%          |
| sync +     | 232082    | 494      | 23        | 99%          |
| responsive | 269863    | 533      | 23        | 99%          |
| throttle   |           |          |           |              |
|------------+-----------+----------+-----------+--------------|

To summarize the data above: no result implies a negative
optimization, and the refinement reduces the total migration time
significantly, thereby shortening the period of guest performance
degradation.

Additionally, we examined the memory performance curves generated
from the test results above; again there is no negative optimization,
although the performance degradation sets in more quickly. Since it
is inconvenient to include the graphs here, this can be verified
independently.

Hyman Huang (3):
  migration: Support background ramblock dirty sync
  qapi/migration: Introduce cpu-throttle-responsive parameter
  migration: Support responsive CPU throttle

 include/migration/misc.h       |   3 +
 migration/migration-hmp-cmds.c |   8 +++
 migration/migration.c          |  11 +++
 migration/options.c            |  20 ++++++
 migration/options.h            |   1 +
 migration/ram.c                | 119 ++++++++++++++++++++++++++++++++-
 migration/ram.h                |   3 +
 migration/trace-events         |   2 +
 qapi/migration.json            |  16 ++++-
 system/cpu-timers.c            |   2 +
 tests/qtest/migration-test.c   |  30 +++++++++
 11 files changed, 212 insertions(+), 3 deletions(-)

--
2.27.0
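For readers skimming the series, the baseline convergence check that
both refinements build on can be distilled as follows. This is a
simplified sketch, not the exact upstream code: the helper name
dirty_rate_high() is invented here for illustration, and "threshold"
stands for the throttle-trigger-threshold parameter (50 by default).

    #include <stdbool.h>
    #include <stdint.h>

    static int dirty_rate_high_cnt;

    /*
     * Sketch of the stock auto-converge trigger: throttle only when
     * the bytes dirtied in a period exceed threshold% of the bytes
     * transferred in the same period, twice in a row.
     */
    static bool dirty_rate_high(uint64_t bytes_dirty_period,
                                uint64_t bytes_xfer_period,
                                uint64_t threshold /* default: 50 */)
    {
        uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;

        if (bytes_dirty_period > bytes_dirty_threshold &&
            ++dirty_rate_high_cnt >= 2) {
            dirty_rate_high_cnt = 0;
            return true; /* start or increase throttling */
        }
        return false;
    }

Since this check only runs once per iteration, a very long iteration
delays both confirmations, which is exactly the gap patch 1 and
patch 3 attack from two different directions.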
From: Hyman Huang <yong.huang@smartx.com>

When a VM is configured with huge memory, the current throttle logic
does not appear to scale, because migration_trigger_throttle() is
only called once per iteration, so it won't be invoked for a long
time if one iteration takes a long time.

The background dirty sync aims to fix the above issue by
synchronizing the ramblock dirty bitmap from the remote dirty bitmap
and, when necessary, triggering the CPU throttle multiple times
during a long iteration.

This is a trade-off between synchronization overhead and CPU throttle
impact.

Signed-off-by: Hyman Huang <yong.huang@smartx.com>
---
 include/migration/misc.h     |  3 ++
 migration/migration.c        | 11 +++++++
 migration/ram.c              | 64 ++++++++++++++++++++++++++++++++++++
 migration/ram.h              |  3 ++
 migration/trace-events       |  1 +
 system/cpu-timers.c          |  2 ++
 tests/qtest/migration-test.c | 29 ++++++++++++++++
 7 files changed, 113 insertions(+)

diff --git a/include/migration/misc.h b/include/migration/misc.h
index XXXXXXX..XXXXXXX 100644
--- a/include/migration/misc.h
+++ b/include/migration/misc.h
@@ -XXX,XX +XXX,XX @@ bool migration_in_bg_snapshot(void);
 /* migration/block-dirty-bitmap.c */
 void dirty_bitmap_mig_init(void);

+/* migration/ram.c */
+void bg_ram_dirty_sync_init(void);
+
 #endif
diff --git a/migration/migration.c b/migration/migration.c
index XXXXXXX..XXXXXXX 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -XXX,XX +XXX,XX @@ static void migration_iteration_finish(MigrationState *s)
 {
     /* If we enabled cpu throttling for auto-converge, turn it off. */
     cpu_throttle_stop();
+    if (migrate_auto_converge()) {
+        bg_ram_dirty_sync_timer_enable(false);
+    }

     bql_lock();
     switch (s->state) {
@@ -XXX,XX +XXX,XX @@ static void *migration_thread(void *opaque)

     trace_migration_thread_setup_complete();

+    /*
+     * Tick the background ramblock dirty sync timer after the setup
+     * phase.
+     */
+    if (migrate_auto_converge()) {
+        bg_ram_dirty_sync_timer_enable(true);
+    }
+
     while (migration_is_active()) {
         if (urgent || !migration_rate_exceeded(s->to_dst_file)) {
             MigIterateState iter_state = migration_iteration_run(s);
diff --git a/migration/ram.c b/migration/ram.c
index XXXXXXX..XXXXXXX 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -XXX,XX +XXX,XX @@
  */
 #define MAPPED_RAM_LOAD_BUF_SIZE 0x100000

+/* Background ramblock dirty sync triggers every five seconds */
+#define BG_RAM_SYNC_TIMESLICE_MS 5000
+#define BG_RAM_SYNC_TIMER_INTERVAL_MS 1000
+
+static QEMUTimer *bg_ram_dirty_sync_timer;
+
 XBZRLECacheStats xbzrle_counters;

 /* used by the search for pages to send */
@@ -XXX,XX +XXX,XX @@ static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
     }
 }

+static void bg_ram_dirty_sync_timer_tick(void *opaque)
+{
+    static int prev_pct;
+    static uint64_t prev_sync_cnt = 2;
+    uint64_t sync_cnt = stat64_get(&mig_stats.dirty_sync_count);
+    int cur_pct = cpu_throttle_get_percentage();
+
+    if (prev_pct && !cur_pct) {
+        /* The CPU throttle timer has stopped, so we stop too */
+        return;
+    }
+
+    /*
+     * The first iteration copies all memory anyhow and has no
+     * effect on guest performance, therefore omit it to avoid
+     * paying extra for the sync penalty.
+     */
+    if (sync_cnt <= 1) {
+        goto end;
+    }
+
+    if (sync_cnt == prev_sync_cnt) {
+        int64_t curr_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+        assert(ram_state);
+        if ((curr_time - ram_state->time_last_bitmap_sync) >
+            BG_RAM_SYNC_TIMESLICE_MS) {
+            trace_bg_ram_dirty_sync();
+            WITH_RCU_READ_LOCK_GUARD() {
+                migration_bitmap_sync_precopy(ram_state, false);
+            }
+        }
+    }
+
+end:
+    prev_sync_cnt = stat64_get(&mig_stats.dirty_sync_count);
+    prev_pct = cpu_throttle_get_percentage();
+
+    timer_mod(bg_ram_dirty_sync_timer,
+              qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) +
+              BG_RAM_SYNC_TIMER_INTERVAL_MS);
+}
+
+void bg_ram_dirty_sync_timer_enable(bool enable)
+{
+    if (enable) {
+        bg_ram_dirty_sync_timer_tick(NULL);
+    } else {
+        timer_del(bg_ram_dirty_sync_timer);
+    }
+}
+
+void bg_ram_dirty_sync_init(void)
+{
+    bg_ram_dirty_sync_timer =
+        timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
+                     bg_ram_dirty_sync_timer_tick, NULL);
+}
+
 static RAMBlockNotifier ram_mig_ram_notifier = {
     .ram_block_resized = ram_mig_ram_block_resized,
 };
diff --git a/migration/ram.h b/migration/ram.h
index XXXXXXX..XXXXXXX 100644
--- a/migration/ram.h
+++ b/migration/ram.h
@@ -XXX,XX +XXX,XX @@ void ram_write_tracking_prepare(void);
 int ram_write_tracking_start(void);
 void ram_write_tracking_stop(void);

+/* Background ramblock dirty sync */
+void bg_ram_dirty_sync_timer_enable(bool enable);
+
 #endif
diff --git a/migration/trace-events b/migration/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -XXX,XX +XXX,XX @@ put_qlist_end(const char *field_name, const char *vmsd_name) "%s(%s)"
 qemu_file_fclose(void) ""

 # ram.c
+bg_ram_dirty_sync(void) ""
 get_queued_page(const char *block_name, uint64_t tmp_offset, unsigned long page_abs) "%s/0x%" PRIx64 " page_abs=0x%lx"
 get_queued_page_not_dirty(const char *block_name, uint64_t tmp_offset, unsigned long page_abs) "%s/0x%" PRIx64 " page_abs=0x%lx"
 migration_bitmap_sync_start(void) ""
diff --git a/system/cpu-timers.c b/system/cpu-timers.c
index XXXXXXX..XXXXXXX 100644
--- a/system/cpu-timers.c
+++ b/system/cpu-timers.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/osdep.h"
 #include "qemu/cutils.h"
 #include "migration/vmstate.h"
+#include "migration/misc.h"
 #include "qapi/error.h"
 #include "qemu/error-report.h"
 #include "sysemu/cpus.h"
@@ -XXX,XX +XXX,XX @@ void cpu_timers_init(void)
     vmstate_register(NULL, 0, &vmstate_timers, &timers_state);

     cpu_throttle_init();
+    bg_ram_dirty_sync_init();
 }
diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -XXX,XX +XXX,XX @@ static void migrate_ensure_converge(QTestState *who)
     migrate_set_parameter_int(who, "downtime-limit", 30 * 1000);
 }

+static void migrate_ensure_iteration_last_long(QTestState *who)
+{
+    /* Set a 10 bytes/s bandwidth limit to make the iteration last long enough */
+    migrate_set_parameter_int(who, "max-bandwidth", 10);
+}
+
 /*
  * Our goal is to ensure that we run a single full migration
  * iteration, and also dirty memory, ensuring that at least
@@ -XXX,XX +XXX,XX @@ static void test_migrate_auto_converge(void)
  * so we need to decrease a bandwidth.
 */
    const int64_t init_pct = 5, inc_pct = 25, max_pct = 95;
+    uint64_t prev_dirty_sync_cnt, dirty_sync_cnt;

    if (test_migrate_start(&from, &to, uri, &args)) {
        return;
@@ -XXX,XX +XXX,XX @@ static void test_migrate_auto_converge(void)
    } while (true);
    /* The first percentage of throttling should be at least init_pct */
    g_assert_cmpint(percentage, >=, init_pct);
+
+    /* Make sure the iteration lasts long enough */
+    migrate_ensure_iteration_last_long(from);
+
+    /*
+     * End the loop when the dirty sync count is greater than 1.
+     */
+    while ((dirty_sync_cnt = get_migration_pass(from)) < 2) {
+        usleep(1000 * 1000);
+    }
+
+    prev_dirty_sync_cnt = dirty_sync_cnt;
+
+    /*
+     * The dirty sync count must change within 5 seconds; add 1
+     * second as a margin of error.
+     */
+    sleep(5 + 1);
+
+    dirty_sync_cnt = get_migration_pass(from);
+    g_assert_cmpint(dirty_sync_cnt, !=, prev_dirty_sync_cnt);
+
    /* Now, when we tested that throttling works, let it converge */
    migrate_ensure_converge(from);
--
2.27.0
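To restate the timer logic above in isolation: the tick fires once
per second, and it forces a bitmap sync only when no iteration-driven
sync has happened since the previous tick and the last sync is older
than the five-second timeslice. The following is a distilled sketch
of that decision, not a drop-in replacement; the helper name
need_background_sync() is invented here, while the two constants come
from the patch:

    #include <stdbool.h>
    #include <stdint.h>

    #define BG_RAM_SYNC_TIMESLICE_MS      5000 /* force a sync after 5s */
    #define BG_RAM_SYNC_TIMER_INTERVAL_MS 1000 /* tick once per second */

    /* Distilled decision logic of bg_ram_dirty_sync_timer_tick() above */
    static bool need_background_sync(uint64_t sync_cnt,
                                     uint64_t prev_sync_cnt,
                                     int64_t now_ms,
                                     int64_t last_sync_ms)
    {
        /* The first pass copies all memory anyway; skip it */
        if (sync_cnt <= 1) {
            return false;
        }
        /*
         * No iteration-driven sync happened since the last tick, and
         * the most recent sync is older than the 5s timeslice.
         */
        return sync_cnt == prev_sync_cnt &&
               (now_ms - last_sync_ms) > BG_RAM_SYNC_TIMESLICE_MS;
    }

The one-second polling interval bounds the extra sync overhead, while
the five-second timeslice guarantees the throttle logic gets fresh
dirty-page data even inside a very long iteration.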
From: Hyman Huang <yong.huang@smartx.com>

To enable the responsive throttle that will be implemented in the
next commit, introduce the cpu-throttle-responsive parameter.

Signed-off-by: Hyman Huang <yong.huang@smartx.com>
---
 migration/migration-hmp-cmds.c |  8 ++++++++
 migration/options.c            | 20 ++++++++++++++++++++
 migration/options.h            |  1 +
 qapi/migration.json            | 16 +++++++++++++++-
 4 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c
index XXXXXXX..XXXXXXX 100644
--- a/migration/migration-hmp-cmds.c
+++ b/migration/migration-hmp-cmds.c
@@ -XXX,XX +XXX,XX @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict)
        monitor_printf(mon, "%s: %s\n",
            MigrationParameter_str(MIGRATION_PARAMETER_CPU_THROTTLE_TAILSLOW),
            params->cpu_throttle_tailslow ? "on" : "off");
+        assert(params->has_cpu_throttle_responsive);
+        monitor_printf(mon, "%s: %s\n",
+            MigrationParameter_str(MIGRATION_PARAMETER_CPU_THROTTLE_RESPONSIVE),
+            params->cpu_throttle_responsive ? "on" : "off");
        assert(params->has_max_cpu_throttle);
        monitor_printf(mon, "%s: %u\n",
            MigrationParameter_str(MIGRATION_PARAMETER_MAX_CPU_THROTTLE),
@@ -XXX,XX +XXX,XX @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
        p->has_cpu_throttle_tailslow = true;
        visit_type_bool(v, param, &p->cpu_throttle_tailslow, &err);
        break;
+    case MIGRATION_PARAMETER_CPU_THROTTLE_RESPONSIVE:
+        p->has_cpu_throttle_responsive = true;
+        visit_type_bool(v, param, &p->cpu_throttle_responsive, &err);
+        break;
    case MIGRATION_PARAMETER_MAX_CPU_THROTTLE:
        p->has_max_cpu_throttle = true;
        visit_type_uint8(v, param, &p->max_cpu_throttle, &err);
diff --git a/migration/options.c b/migration/options.c
index XXXXXXX..XXXXXXX 100644
--- a/migration/options.c
+++ b/migration/options.c
@@ -XXX,XX +XXX,XX @@ Property migration_properties[] = {
                      DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT),
    DEFINE_PROP_BOOL("x-cpu-throttle-tailslow", MigrationState,
                      parameters.cpu_throttle_tailslow, false),
+    DEFINE_PROP_BOOL("x-cpu-throttle-responsive", MigrationState,
+                      parameters.cpu_throttle_responsive, false),
    DEFINE_PROP_SIZE("x-max-bandwidth", MigrationState,
                      parameters.max_bandwidth, MAX_THROTTLE),
    DEFINE_PROP_SIZE("avail-switchover-bandwidth", MigrationState,
@@ -XXX,XX +XXX,XX @@ uint8_t migrate_cpu_throttle_initial(void)
    return s->parameters.cpu_throttle_initial;
 }

+bool migrate_cpu_throttle_responsive(void)
+{
+    MigrationState *s = migrate_get_current();
+
+    return s->parameters.cpu_throttle_responsive;
+}
+
 bool migrate_cpu_throttle_tailslow(void)
 {
    MigrationState *s = migrate_get_current();
@@ -XXX,XX +XXX,XX @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp)
    params->cpu_throttle_increment = s->parameters.cpu_throttle_increment;
    params->has_cpu_throttle_tailslow = true;
    params->cpu_throttle_tailslow = s->parameters.cpu_throttle_tailslow;
+    params->has_cpu_throttle_responsive = true;
+    params->cpu_throttle_responsive = s->parameters.cpu_throttle_responsive;
    params->tls_creds = g_strdup(s->parameters.tls_creds);
    params->tls_hostname = g_strdup(s->parameters.tls_hostname);
    params->tls_authz = g_strdup(s->parameters.tls_authz ?
@@ -XXX,XX +XXX,XX @@ void migrate_params_init(MigrationParameters *params)
    params->has_cpu_throttle_initial = true;
    params->has_cpu_throttle_increment = true;
    params->has_cpu_throttle_tailslow = true;
+    params->has_cpu_throttle_responsive = true;
    params->has_max_bandwidth = true;
    params->has_downtime_limit = true;
    params->has_x_checkpoint_delay = true;
@@ -XXX,XX +XXX,XX @@ static void migrate_params_test_apply(MigrateSetParameters *params,
        dest->cpu_throttle_tailslow = params->cpu_throttle_tailslow;
    }

+    if (params->has_cpu_throttle_responsive) {
+        dest->cpu_throttle_responsive = params->cpu_throttle_responsive;
+    }
+
    if (params->tls_creds) {
        assert(params->tls_creds->type == QTYPE_QSTRING);
        dest->tls_creds = params->tls_creds->u.s;
@@ -XXX,XX +XXX,XX @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp)
        s->parameters.cpu_throttle_tailslow = params->cpu_throttle_tailslow;
    }

+    if (params->has_cpu_throttle_responsive) {
+        s->parameters.cpu_throttle_responsive = params->cpu_throttle_responsive;
+    }
+
    if (params->tls_creds) {
        g_free(s->parameters.tls_creds);
        assert(params->tls_creds->type == QTYPE_QSTRING);
diff --git a/migration/options.h b/migration/options.h
index XXXXXXX..XXXXXXX 100644
--- a/migration/options.h
+++ b/migration/options.h
@@ -XXX,XX +XXX,XX @@ bool migrate_has_block_bitmap_mapping(void);
 uint32_t migrate_checkpoint_delay(void);
 uint8_t migrate_cpu_throttle_increment(void);
 uint8_t migrate_cpu_throttle_initial(void);
+bool migrate_cpu_throttle_responsive(void);
 bool migrate_cpu_throttle_tailslow(void);
 bool migrate_direct_io(void);
 uint64_t migrate_downtime_limit(void);
diff --git a/qapi/migration.json b/qapi/migration.json
index XXXXXXX..XXXXXXX 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -XXX,XX +XXX,XX @@
 #     be excessive at tail stage. The default value is false. (Since
 #     5.1)
 #
+# @cpu-throttle-responsive: Make CPU throttling more responsive by
+#     introducing an extra detection metric of migration
+#     convergence. (Since 9.2)
+#
 # @tls-creds: ID of the 'tls-creds' object that provides credentials
 #     for establishing a TLS connection over the migration data
 #     channel. On the outgoing side of the migration, the credentials
@@ -XXX,XX +XXX,XX @@
            'announce-rounds', 'announce-step',
            'throttle-trigger-threshold',
            'cpu-throttle-initial', 'cpu-throttle-increment',
-           'cpu-throttle-tailslow',
+           'cpu-throttle-tailslow', 'cpu-throttle-responsive',
            'tls-creds', 'tls-hostname', 'tls-authz', 'max-bandwidth',
            'avail-switchover-bandwidth', 'downtime-limit',
            { 'name': 'x-checkpoint-delay', 'features': [ 'unstable' ] },
@@ -XXX,XX +XXX,XX @@
 #     be excessive at tail stage. The default value is false. (Since
 #     5.1)
 #
+# @cpu-throttle-responsive: Make CPU throttling more responsive by
+#     introducing an extra detection metric of migration
+#     convergence. (Since 9.2)
+#
 # @tls-creds: ID of the 'tls-creds' object that provides credentials
 #     for establishing a TLS connection over the migration data
 #     channel. On the outgoing side of the migration, the credentials
@@ -XXX,XX +XXX,XX @@
            '*cpu-throttle-initial': 'uint8',
            '*cpu-throttle-increment': 'uint8',
            '*cpu-throttle-tailslow': 'bool',
+           '*cpu-throttle-responsive': 'bool',
            '*tls-creds': 'StrOrNull',
            '*tls-hostname': 'StrOrNull',
            '*tls-authz': 'StrOrNull',
@@ -XXX,XX +XXX,XX @@
 #     be excessive at tail stage. The default value is false. (Since
 #     5.1)
 #
+# @cpu-throttle-responsive: Make CPU throttling more responsive by
+#     introducing an extra detection metric of migration
+#     convergence. (Since 9.2)
+#
 # @tls-creds: ID of the 'tls-creds' object that provides credentials
 #     for establishing a TLS connection over the migration data
 #     channel. On the outgoing side of the migration, the credentials
@@ -XXX,XX +XXX,XX @@
            '*cpu-throttle-initial': 'uint8',
            '*cpu-throttle-increment': 'uint8',
            '*cpu-throttle-tailslow': 'bool',
+           '*cpu-throttle-responsive': 'bool',
            '*tls-creds': 'str',
            '*tls-hostname': 'str',
            '*tls-authz': 'str',
--
2.27.0
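For completeness, the knob is off by default. Assuming the QAPI name
above, a qtest can flip it with the helpers that already exist in
tests/qtest/migration-test.c (as the test in the next patch does);
the wrapper function name below is invented for illustration:

    /*
     * Sketch: enabling the new knob from a qtest. auto-converge must
     * also be enabled, since all of the throttle logic hangs off it.
     */
    static void enable_responsive_throttle(QTestState *who)
    {
        migrate_set_capability(who, "auto-converge", true);
        migrate_set_parameter_bool(who, "cpu-throttle-responsive", true);
    }

Given the HMP hunk above, the monitor equivalent should be
"migrate_set_parameter cpu-throttle-responsive on".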
From: Hyman Huang <yong.huang@smartx.com>

Currently, the convergence algorithm determines that the migration
cannot converge according to the following principle: the dirty pages
generated in the current iteration exceed a specific percentage
(throttle-trigger-threshold, 50 by default) of the pages transmitted.
Let's refer to this criterion as the "dirty rate". If this criterion
is met at least twice (dirty_rate_high_cnt >= 2), the throttle
percentage is increased.

In most cases, the above implementation is appropriate. However, for
a VM with a high memory overload, each iteration is time-consuming.
The VM's computing performance may be throttled at a high percentage
for a long time due to the repeated confirmation behavior, which may
be intolerable for some computationally sensitive software in the VM.

As the comment in the migration_trigger_throttle() function mentions,
the original algorithm confirms the criterion repeatedly in order to
avoid erroneous detection. Put differently, the criterion does not
need to be validated again once the detection is reliable enough.

In this refinement, to make the detection more accurate, we introduce
another criterion, called the "dirty ratio", to determine migration
convergence. The "dirty ratio" is the ratio of bytes_dirty_period to
bytes_xfer_period. When the algorithm repeatedly detects that the
"dirty ratio" of the current sync is no lower than that of the
previous one, it determines that the migration cannot converge.

Of the "dirty rate" and the "dirty ratio", if either criterion is
met, the penalty percentage is increased. This makes the CPU throttle
respond more quickly, which shortens the entire iteration and
therefore reduces the duration of VM performance degradation.

In conclusion, this refinement significantly reduces the time it
takes for the throttle percentage to step to its maximum while the VM
is under a high memory load.

Signed-off-by: Hyman Huang <yong.huang@smartx.com>
---
 migration/ram.c              | 55 ++++++++++++++++++++++++++++++++++--
 migration/trace-events       |  1 +
 tests/qtest/migration-test.c |  1 +
 3 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index XXXXXXX..XXXXXXX 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -XXX,XX +XXX,XX @@ struct RAMState {
     * RAM migration.
     */
    unsigned int postcopy_bmap_sync_requested;
+
+    /*
+     * Ratio of bytes_dirty_period to bytes_xfer_period in the
+     * previous sync.
+     */
+    uint64_t dirty_ratio_pct;
 };
 typedef struct RAMState RAMState;
@@ -XXX,XX +XXX,XX @@ static void migration_dirty_limit_guest(void)
    trace_migration_dirty_limit_guest(quota_dirtyrate);
 }

+static bool migration_dirty_ratio_high(RAMState *rs)
+{
+    static int dirty_ratio_high_cnt;
+    uint64_t threshold = migrate_throttle_trigger_threshold();
+    uint64_t bytes_xfer_period =
+        migration_transferred_bytes() - rs->bytes_xfer_prev;
+    uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
+    bool dirty_ratio_high = false;
+    uint64_t prev, curr;
+
+    /* Calculate the dirty ratio percentage */
+    curr = 100 * (bytes_dirty_period * 1.0 / bytes_xfer_period);
+
+    prev = rs->dirty_ratio_pct;
+    rs->dirty_ratio_pct = curr;
+
+    if (prev == 0) {
+        return false;
+    }
+
+    /*
+     * If the current dirty ratio is above the threshold and not
+     * lower than the previous one, assume that the migration cannot
+     * converge.
+     */
+    if (curr > threshold && curr >= prev) {
+        trace_migration_dirty_ratio_high(curr, prev);
+        dirty_ratio_high_cnt++;
+    }
+
+    if (dirty_ratio_high_cnt >= 2) {
+        dirty_ratio_high = true;
+        dirty_ratio_high_cnt = 0;
+    }
+
+    return dirty_ratio_high;
+}
+
 static void migration_trigger_throttle(RAMState *rs)
 {
    uint64_t threshold = migrate_throttle_trigger_threshold();
@@ -XXX,XX +XXX,XX @@ static void migration_trigger_throttle(RAMState *rs)
        migration_transferred_bytes() - rs->bytes_xfer_prev;
    uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
    uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
+    bool dirty_ratio_high = false;
+
+    if (migrate_cpu_throttle_responsive() && (bytes_xfer_period != 0)) {
+        dirty_ratio_high = migration_dirty_ratio_high(rs);
+    }

    /*
     * The following detection logic can be refined later. For now:
@@ -XXX,XX +XXX,XX @@ static void migration_trigger_throttle(RAMState *rs)
     * twice, start or increase throttling.
     */
    if ((bytes_dirty_period > bytes_dirty_threshold) &&
-        (++rs->dirty_rate_high_cnt >= 2)) {
-        rs->dirty_rate_high_cnt = 0;
+        ((++rs->dirty_rate_high_cnt >= 2) || dirty_ratio_high)) {
+
+        rs->dirty_rate_high_cnt =
+            rs->dirty_rate_high_cnt >= 2 ? 0 : rs->dirty_rate_high_cnt;
+
        if (migrate_auto_converge()) {
            trace_migration_throttle();
            mig_throttle_guest_down(bytes_dirty_period,
diff --git a/migration/trace-events b/migration/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -XXX,XX +XXX,XX @@ get_queued_page_not_dirty(const char *block_name, uint64_t tmp_offset, unsigned
 migration_bitmap_sync_start(void) ""
 migration_bitmap_sync_end(uint64_t dirty_pages) "dirty_pages %" PRIu64
 migration_bitmap_clear_dirty(char *str, uint64_t start, uint64_t size, unsigned long page) "rb %s start 0x%"PRIx64" size 0x%"PRIx64" page 0x%lx"
+migration_dirty_ratio_high(uint64_t cur, uint64_t prev) "current ratio: %" PRIu64 " previous ratio: %" PRIu64
 migration_throttle(void) ""
 migration_dirty_limit_guest(int64_t dirtyrate) "guest dirty page rate limit %" PRIi64 " MB/s"
 ram_discard_range(const char *rbname, uint64_t start, size_t len) "%s: start: %" PRIx64 " %zx"
diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -XXX,XX +XXX,XX @@ static void test_migrate_auto_converge(void)
    migrate_set_parameter_int(from, "cpu-throttle-initial", init_pct);
    migrate_set_parameter_int(from, "cpu-throttle-increment", inc_pct);
    migrate_set_parameter_int(from, "max-cpu-throttle", max_pct);
+    migrate_set_parameter_bool(from, "cpu-throttle-responsive", true);

    /*
     * Set the initial parameters so that the migration could not converge
--
2.27.0
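As a worked example of the new criterion, with made-up numbers rather
than measured ones: if 600MB were dirtied while 1000MB were
transferred in the same sync period, the dirty ratio is 60%. With
throttle-trigger-threshold at its default of 50 and a previous ratio
of, say, 55%, the check "curr > threshold && curr >= prev" holds, so
this sync counts toward dirty_ratio_high_cnt; two such syncs in a row
raise the throttle. The following standalone snippet mirrors the
arithmetic of migration_dirty_ratio_high() above:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t threshold = 50;                 /* throttle-trigger-threshold default */
        uint64_t bytes_xfer_period = 1000 << 20; /* ~1000MB sent this period */
        uint64_t bytes_dirty_period = 600 << 20; /* ~600MB dirtied meanwhile */
        uint64_t prev = 55;                      /* ratio seen at the previous sync */

        /* Same expression as in migration_dirty_ratio_high() */
        uint64_t curr = 100 * (bytes_dirty_period * 1.0 / bytes_xfer_period);

        /* Prints: dirty ratio 60% (prev 55%): counts as high */
        printf("dirty ratio %" PRIu64 "%% (prev %" PRIu64 "%%): %s\n",
               curr, prev,
               (curr > threshold && curr >= prev) ? "counts as high"
                                                  : "converging");
        return 0;
    }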