From: Hyman Huang(黄勇) <huangy81@chinatelecom.cn>
Abstract out the dirty log change logic into the function
global_dirty_log_change.

Abstract out the dirty-ring-based dirty page rate calculation
into the function vcpu_calculate_dirtyrate.

Abstract out the dirty page rate math into do_calculate_dirtyrate,
decoupling it from DirtyStat.

Rename set_sample_page_period to dirty_stat_wait, which is easier
to understand and will be reused in dirtylimit.

Take cpu_list_lock to protect the cpu list before walking it, so the
walk cannot race against cpu hotplug/unplug.

Export the utility functions for use outside migration.
Signed-off-by: Hyman Huang(黄勇) <huangy81@chinatelecom.cn>
---
include/sysemu/dirtyrate.h | 29 ++++++
migration/dirtyrate.c | 220 ++++++++++++++++++++++++++++-----------------
migration/dirtyrate.h | 7 +-
3 files changed, 171 insertions(+), 85 deletions(-)
create mode 100644 include/sysemu/dirtyrate.h
diff --git a/include/sysemu/dirtyrate.h b/include/sysemu/dirtyrate.h
new file mode 100644
index 0000000..cb6f02b
--- /dev/null
+++ b/include/sysemu/dirtyrate.h
@@ -0,0 +1,29 @@
+/*
+ * dirty page rate helper functions
+ *
+ * Copyright (c) 2022 CHINA TELECOM CO.,LTD.
+ *
+ * Authors:
+ * Hyman Huang(黄勇) <huangy81@chinatelecom.cn>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_DIRTYRATE_H
+#define QEMU_DIRTYRATE_H
+
+typedef struct VcpuStat {
+ int nvcpu; /* number of vcpu */
+ DirtyRateVcpu *rates; /* array of dirty rate for each vcpu */
+} VcpuStat;
+
+int64_t vcpu_calculate_dirtyrate(int64_t calc_time_ms,
+ int64_t init_time_ms,
+ VcpuStat *stat,
+ unsigned int flag,
+ bool one_shot);
+
+void global_dirty_log_change(unsigned int flag,
+ bool start);
+#endif
diff --git a/migration/dirtyrate.c b/migration/dirtyrate.c
index d65e744..1407455 100644
--- a/migration/dirtyrate.c
+++ b/migration/dirtyrate.c
@@ -46,7 +46,7 @@ static struct DirtyRateStat DirtyStat;
static DirtyRateMeasureMode dirtyrate_mode =
DIRTY_RATE_MEASURE_MODE_PAGE_SAMPLING;
-static int64_t set_sample_page_period(int64_t msec, int64_t initial_time)
+static int64_t dirty_stat_wait(int64_t msec, int64_t initial_time)
{
int64_t current_time;
@@ -60,6 +60,128 @@ static int64_t set_sample_page_period(int64_t msec, int64_t initial_time)
return msec;
}
+static inline void record_dirtypages(DirtyPageRecord *dirty_pages,
+ CPUState *cpu, bool start)
+{
+ if (start) {
+ dirty_pages[cpu->cpu_index].start_pages = cpu->dirty_pages;
+ } else {
+ dirty_pages[cpu->cpu_index].end_pages = cpu->dirty_pages;
+ }
+}
+
+static int64_t do_calculate_dirtyrate(DirtyPageRecord dirty_pages,
+ int64_t calc_time_ms)
+{
+ uint64_t memory_size_MB;
+ uint64_t increased_dirty_pages =
+ dirty_pages.end_pages - dirty_pages.start_pages;
+
+ memory_size_MB = (increased_dirty_pages * TARGET_PAGE_SIZE) >> 20;
+
+ return memory_size_MB * 1000 / calc_time_ms;
+}
+
+void global_dirty_log_change(unsigned int flag, bool start)
+{
+ qemu_mutex_lock_iothread();
+ if (start) {
+ memory_global_dirty_log_start(flag);
+ } else {
+ memory_global_dirty_log_stop(flag);
+ }
+ qemu_mutex_unlock_iothread();
+}
+
+/*
+ * global_dirty_log_sync
+ * 1. sync dirty log from kvm
+ * 2. stop dirty tracking if needed.
+ */
+static void global_dirty_log_sync(unsigned int flag, bool one_shot)
+{
+ qemu_mutex_lock_iothread();
+ memory_global_dirty_log_sync();
+ if (one_shot) {
+ memory_global_dirty_log_stop(flag);
+ }
+ qemu_mutex_unlock_iothread();
+}
+
+static DirtyPageRecord *vcpu_dirty_stat_alloc(VcpuStat *stat)
+{
+ CPUState *cpu;
+ DirtyPageRecord *records;
+ int nvcpu = 0;
+
+ CPU_FOREACH(cpu) {
+ nvcpu++;
+ }
+
+ stat->nvcpu = nvcpu;
+ stat->rates = g_malloc0(sizeof(DirtyRateVcpu) * nvcpu);
+
+ records = g_malloc0(sizeof(DirtyPageRecord) * nvcpu);
+
+ return records;
+}
+
+static void vcpu_dirty_stat_collect(VcpuStat *stat,
+ DirtyPageRecord *records,
+ bool start)
+{
+ CPUState *cpu;
+
+ CPU_FOREACH(cpu) {
+ if (!start && cpu->cpu_index >= stat->nvcpu) {
+ /*
+ * Never go there unless cpu is hot-plugged,
+ * just ignore in this case.
+ */
+ continue;
+ }
+ record_dirtypages(records, cpu, start);
+ }
+}
+
+int64_t vcpu_calculate_dirtyrate(int64_t calc_time_ms,
+ int64_t init_time_ms,
+ VcpuStat *stat,
+ unsigned int flag,
+ bool one_shot)
+{
+ DirtyPageRecord *records;
+ int64_t duration;
+ int64_t dirtyrate;
+ int i = 0;
+
+ cpu_list_lock();
+ records = vcpu_dirty_stat_alloc(stat);
+ vcpu_dirty_stat_collect(stat, records, true);
+ cpu_list_unlock();
+
+ duration = dirty_stat_wait(calc_time_ms, init_time_ms);
+
+ global_dirty_log_sync(flag, one_shot);
+
+ cpu_list_lock();
+ vcpu_dirty_stat_collect(stat, records, false);
+ cpu_list_unlock();
+
+ for (i = 0; i < stat->nvcpu; i++) {
+ dirtyrate = do_calculate_dirtyrate(records[i], duration);
+
+ stat->rates[i].id = i;
+ stat->rates[i].dirty_rate = dirtyrate;
+
+ trace_dirtyrate_do_calculate_vcpu(i, dirtyrate);
+ }
+
+ g_free(records);
+
+ return duration;
+}
+
static bool is_sample_period_valid(int64_t sec)
{
if (sec < MIN_FETCH_DIRTYRATE_TIME_SEC ||
@@ -396,44 +518,6 @@ static bool compare_page_hash_info(struct RamblockDirtyInfo *info,
return true;
}
-static inline void record_dirtypages(DirtyPageRecord *dirty_pages,
- CPUState *cpu, bool start)
-{
- if (start) {
- dirty_pages[cpu->cpu_index].start_pages = cpu->dirty_pages;
- } else {
- dirty_pages[cpu->cpu_index].end_pages = cpu->dirty_pages;
- }
-}
-
-static void dirtyrate_global_dirty_log_start(void)
-{
- qemu_mutex_lock_iothread();
- memory_global_dirty_log_start(GLOBAL_DIRTY_DIRTY_RATE);
- qemu_mutex_unlock_iothread();
-}
-
-static void dirtyrate_global_dirty_log_stop(void)
-{
- qemu_mutex_lock_iothread();
- memory_global_dirty_log_sync();
- memory_global_dirty_log_stop(GLOBAL_DIRTY_DIRTY_RATE);
- qemu_mutex_unlock_iothread();
-}
-
-static int64_t do_calculate_dirtyrate_vcpu(DirtyPageRecord dirty_pages)
-{
- uint64_t memory_size_MB;
- int64_t time_s;
- uint64_t increased_dirty_pages =
- dirty_pages.end_pages - dirty_pages.start_pages;
-
- memory_size_MB = (increased_dirty_pages * TARGET_PAGE_SIZE) >> 20;
- time_s = DirtyStat.calc_time;
-
- return memory_size_MB / time_s;
-}
-
static inline void record_dirtypages_bitmap(DirtyPageRecord *dirty_pages,
bool start)
{
@@ -444,11 +528,6 @@ static inline void record_dirtypages_bitmap(DirtyPageRecord *dirty_pages,
}
}
-static void do_calculate_dirtyrate_bitmap(DirtyPageRecord dirty_pages)
-{
- DirtyStat.dirty_rate = do_calculate_dirtyrate_vcpu(dirty_pages);
-}
-
static inline void dirtyrate_manual_reset_protect(void)
{
RAMBlock *block = NULL;
@@ -492,71 +571,52 @@ static void calculate_dirtyrate_dirty_bitmap(struct DirtyRateConfig config)
DirtyStat.start_time = start_time / 1000;
msec = config.sample_period_seconds * 1000;
- msec = set_sample_page_period(msec, start_time);
+ msec = dirty_stat_wait(msec, start_time);
DirtyStat.calc_time = msec / 1000;
/*
- * dirtyrate_global_dirty_log_stop do two things.
+ * do two things.
* 1. fetch dirty bitmap from kvm
* 2. stop dirty tracking
*/
- dirtyrate_global_dirty_log_stop();
+ global_dirty_log_sync(GLOBAL_DIRTY_DIRTY_RATE, true);
record_dirtypages_bitmap(&dirty_pages, false);
- do_calculate_dirtyrate_bitmap(dirty_pages);
+ DirtyStat.dirty_rate = do_calculate_dirtyrate(dirty_pages, msec);
}
static void calculate_dirtyrate_dirty_ring(struct DirtyRateConfig config)
{
- CPUState *cpu;
- int64_t msec = 0;
int64_t start_time;
+ int64_t duration;
uint64_t dirtyrate = 0;
uint64_t dirtyrate_sum = 0;
- DirtyPageRecord *dirty_pages;
- int nvcpu = 0;
int i = 0;
- CPU_FOREACH(cpu) {
- nvcpu++;
- }
-
- dirty_pages = malloc(sizeof(*dirty_pages) * nvcpu);
-
- DirtyStat.dirty_ring.nvcpu = nvcpu;
- DirtyStat.dirty_ring.rates = malloc(sizeof(DirtyRateVcpu) * nvcpu);
-
- dirtyrate_global_dirty_log_start();
-
- CPU_FOREACH(cpu) {
- record_dirtypages(dirty_pages, cpu, true);
- }
+ /* start log sync */
+ global_dirty_log_change(GLOBAL_DIRTY_DIRTY_RATE, true);
start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
DirtyStat.start_time = start_time / 1000;
- msec = config.sample_period_seconds * 1000;
- msec = set_sample_page_period(msec, start_time);
- DirtyStat.calc_time = msec / 1000;
+ /* calculate vcpu dirtyrate */
+ duration = vcpu_calculate_dirtyrate(config.sample_period_seconds * 1000,
+ start_time,
+ &DirtyStat.dirty_ring,
+ GLOBAL_DIRTY_DIRTY_RATE,
+ true);
- dirtyrate_global_dirty_log_stop();
-
- CPU_FOREACH(cpu) {
- record_dirtypages(dirty_pages, cpu, false);
- }
+ DirtyStat.calc_time = duration / 1000;
+ /* calculate vm dirtyrate */
for (i = 0; i < DirtyStat.dirty_ring.nvcpu; i++) {
- dirtyrate = do_calculate_dirtyrate_vcpu(dirty_pages[i]);
- trace_dirtyrate_do_calculate_vcpu(i, dirtyrate);
-
- DirtyStat.dirty_ring.rates[i].id = i;
+ dirtyrate = DirtyStat.dirty_ring.rates[i].dirty_rate;
DirtyStat.dirty_ring.rates[i].dirty_rate = dirtyrate;
dirtyrate_sum += dirtyrate;
}
DirtyStat.dirty_rate = dirtyrate_sum;
- free(dirty_pages);
}
static void calculate_dirtyrate_sample_vm(struct DirtyRateConfig config)
@@ -574,7 +634,7 @@ static void calculate_dirtyrate_sample_vm(struct DirtyRateConfig config)
rcu_read_unlock();
msec = config.sample_period_seconds * 1000;
- msec = set_sample_page_period(msec, initial_time);
+ msec = dirty_stat_wait(msec, initial_time);
DirtyStat.start_time = initial_time / 1000;
DirtyStat.calc_time = msec / 1000;
diff --git a/migration/dirtyrate.h b/migration/dirtyrate.h
index 69d4c5b..594a5c0 100644
--- a/migration/dirtyrate.h
+++ b/migration/dirtyrate.h
@@ -13,6 +13,8 @@
#ifndef QEMU_MIGRATION_DIRTYRATE_H
#define QEMU_MIGRATION_DIRTYRATE_H
+#include "sysemu/dirtyrate.h"
+
/*
* Sample 512 pages per GB as default.
*/
@@ -65,11 +67,6 @@ typedef struct SampleVMStat {
uint64_t total_block_mem_MB; /* size of total sampled pages in MB */
} SampleVMStat;
-typedef struct VcpuStat {
- int nvcpu; /* number of vcpu */
- DirtyRateVcpu *rates; /* array of dirty rate for each vcpu */
-} VcpuStat;
-
/*
* Store calculation statistics for each measure.
*/
--
1.8.3.1
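
For reference, a minimal sketch of how a caller outside migration/ (e.g. the
planned dirtylimit code) might use the two exported helpers. Only
vcpu_calculate_dirtyrate() and global_dirty_log_change() come from this patch;
the wrapper function, the one-second period and the reuse of
GLOBAL_DIRTY_DIRTY_RATE as the flag are illustrative assumptions:

    #include "qemu/osdep.h"
    #include "qemu/timer.h"
    #include "sysemu/dirtyrate.h"

    /* Hypothetical caller: measure per-vcpu dirty rates once, over 1 second. */
    static void example_measure_vcpu_dirtyrate(void)
    {
        VcpuStat stat = { 0 };
        int64_t start_time, duration;
        uint64_t sum = 0;
        int i;

        /* Start dirty logging and keep it running across the measurement. */
        global_dirty_log_change(GLOBAL_DIRTY_DIRTY_RATE, true);

        start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
        duration = vcpu_calculate_dirtyrate(1000 /* calc_time_ms */,
                                            start_time,
                                            &stat,
                                            GLOBAL_DIRTY_DIRTY_RATE,
                                            false /* one_shot */);

        for (i = 0; i < stat.nvcpu; i++) {
            /* per-vcpu rate is in MB/s, measured over 'duration' ms */
            sum += stat.rates[i].dirty_rate;
        }
        /* 'sum' is the whole-VM dirty rate in MB/s. */

        g_free(stat.rates);
        global_dirty_log_change(GLOBAL_DIRTY_DIRTY_RATE, false);
    }
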
On Wed, Jan 05, 2022 at 01:14:06AM +0800, huangy81@chinatelecom.cn wrote:
> From: Hyman Huang(黄勇) <huangy81@chinatelecom.cn>
>
> abstract out dirty log change logic into function
> global_dirty_log_change.
>
> abstract out dirty page rate calculation logic via
> dirty-ring into function vcpu_calculate_dirtyrate.
>
> abstract out mathematical dirty page rate calculation
> into do_calculate_dirtyrate, decouple it from DirtyStat.
>
> rename set_sample_page_period to dirty_stat_wait, which
> is well-understood and will be reused in dirtylimit.
>
> add cpu_list_lock to protect cpu list before walking
> through it in case of race against cpu hotplug/unplug.
>
> export util functions outside migration.
>
> Signed-off-by: Hyman Huang(黄勇) <huangy81@chinatelecom.cn>
> ---
> include/sysemu/dirtyrate.h | 29 ++++++
> migration/dirtyrate.c | 220 ++++++++++++++++++++++++++++-----------------
> migration/dirtyrate.h | 7 +-
> 3 files changed, 171 insertions(+), 85 deletions(-)
> create mode 100644 include/sysemu/dirtyrate.h
>
> diff --git a/include/sysemu/dirtyrate.h b/include/sysemu/dirtyrate.h
> new file mode 100644
> index 0000000..cb6f02b
> --- /dev/null
> +++ b/include/sysemu/dirtyrate.h
> @@ -0,0 +1,29 @@
> +/*
> + * dirty page rate helper functions
> + *
> + * Copyright (c) 2022 CHINA TELECOM CO.,LTD.
> + *
> + * Authors:
> + * Hyman Huang(黄勇) <huangy81@chinatelecom.cn>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + */
> +
> +#ifndef QEMU_DIRTYRATE_H
> +#define QEMU_DIRTYRATE_H
> +
> +typedef struct VcpuStat {
> + int nvcpu; /* number of vcpu */
> + DirtyRateVcpu *rates; /* array of dirty rate for each vcpu */
> +} VcpuStat;
> +
> +int64_t vcpu_calculate_dirtyrate(int64_t calc_time_ms,
> + int64_t init_time_ms,
> + VcpuStat *stat,
> + unsigned int flag,
> + bool one_shot);
> +
> +void global_dirty_log_change(unsigned int flag,
> + bool start);
> +#endif
> diff --git a/migration/dirtyrate.c b/migration/dirtyrate.c
> index d65e744..1407455 100644
> --- a/migration/dirtyrate.c
> +++ b/migration/dirtyrate.c
> @@ -46,7 +46,7 @@ static struct DirtyRateStat DirtyStat;
> static DirtyRateMeasureMode dirtyrate_mode =
> DIRTY_RATE_MEASURE_MODE_PAGE_SAMPLING;
>
> -static int64_t set_sample_page_period(int64_t msec, int64_t initial_time)
> +static int64_t dirty_stat_wait(int64_t msec, int64_t initial_time)
> {
> int64_t current_time;
>
> @@ -60,6 +60,128 @@ static int64_t set_sample_page_period(int64_t msec, int64_t initial_time)
> return msec;
> }
>
> +static inline void record_dirtypages(DirtyPageRecord *dirty_pages,
> + CPUState *cpu, bool start)
> +{
> + if (start) {
> + dirty_pages[cpu->cpu_index].start_pages = cpu->dirty_pages;
> + } else {
> + dirty_pages[cpu->cpu_index].end_pages = cpu->dirty_pages;
> + }
> +}
> +
> +static int64_t do_calculate_dirtyrate(DirtyPageRecord dirty_pages,
> + int64_t calc_time_ms)
> +{
> + uint64_t memory_size_MB;
> + uint64_t increased_dirty_pages =
> + dirty_pages.end_pages - dirty_pages.start_pages;
> +
> + memory_size_MB = (increased_dirty_pages * TARGET_PAGE_SIZE) >> 20;
> +
> + return memory_size_MB * 1000 / calc_time_ms;
> +}
> +
> +void global_dirty_log_change(unsigned int flag, bool start)
> +{
> + qemu_mutex_lock_iothread();
> + if (start) {
> + memory_global_dirty_log_start(flag);
> + } else {
> + memory_global_dirty_log_stop(flag);
> + }
> + qemu_mutex_unlock_iothread();
> +}
> +
> +/*
> + * global_dirty_log_sync
> + * 1. sync dirty log from kvm
> + * 2. stop dirty tracking if needed.
> + */
> +static void global_dirty_log_sync(unsigned int flag, bool one_shot)
> +{
> + qemu_mutex_lock_iothread();
> + memory_global_dirty_log_sync();
> + if (one_shot) {
> + memory_global_dirty_log_stop(flag);
> + }
> + qemu_mutex_unlock_iothread();
> +}
> +
> +static DirtyPageRecord *vcpu_dirty_stat_alloc(VcpuStat *stat)
> +{
> + CPUState *cpu;
> + DirtyPageRecord *records;
> + int nvcpu = 0;
> +
> + CPU_FOREACH(cpu) {
> + nvcpu++;
> + }
> +
> + stat->nvcpu = nvcpu;
> + stat->rates = g_malloc0(sizeof(DirtyRateVcpu) * nvcpu);
> +
> + records = g_malloc0(sizeof(DirtyPageRecord) * nvcpu);
> +
> + return records;
> +}
> +
> +static void vcpu_dirty_stat_collect(VcpuStat *stat,
> + DirtyPageRecord *records,
> + bool start)
> +{
> + CPUState *cpu;
> +
> + CPU_FOREACH(cpu) {
> + if (!start && cpu->cpu_index >= stat->nvcpu) {
> + /*
> + * Never go there unless cpu is hot-plugged,
> + * just ignore in this case.
> + */
> + continue;
> + }
As commented before, I think the only way to do this right is to not allow cpu
plug/unplug during the measurement..

Say, even if the index didn't get out of range, an unplug event would still
produce very strange output for the unplugged cpu. Please see more below.
> + record_dirtypages(records, cpu, start);
> + }
> +}
> +
> +int64_t vcpu_calculate_dirtyrate(int64_t calc_time_ms,
> + int64_t init_time_ms,
> + VcpuStat *stat,
> + unsigned int flag,
> + bool one_shot)
> +{
> + DirtyPageRecord *records;
> + int64_t duration;
> + int64_t dirtyrate;
> + int i = 0;
> +
> + cpu_list_lock();
> + records = vcpu_dirty_stat_alloc(stat);
> + vcpu_dirty_stat_collect(stat, records, true);
> + cpu_list_unlock();
Continuing from the above - I'm wondering whether we should just keep holding
the lock until vcpu_dirty_stat_collect().

Yes, we could be holding the lock for a long time because of the sleep, but the
plug path in the main thread would just wait for it to complete, and at least
it is not, e.g., a deadlock.
The other solution is to do cpu_list_unlock() like this, but introduce another
cpu_list_generation_id and bump it after any plug/unplug of a cpu, i.e. whenever
the cpu list changes.

Then we record the cpu generation ID at the entry of this function and retry the
whole measurement if at some point we find the generation ID has changed (we
need to fetch the gen ID after taking the lock, of course). That would avoid
holding the cpu list lock during dirty_stat_wait(), but it starts to complicate
the cpu list locking rules.
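
A rough sketch of that retry idea, purely for illustration:
cpu_list_generation_id_get() and the counter behind it are hypothetical and do
not exist today, while the other helpers are the ones from this patch:

    int64_t vcpu_calculate_dirtyrate(int64_t calc_time_ms,
                                     int64_t init_time_ms,
                                     VcpuStat *stat,
                                     unsigned int flag,
                                     bool one_shot)
    {
        DirtyPageRecord *records;
        int64_t duration;
        int64_t dirtyrate;
        unsigned int gen_id;
        int i;

    retry:
        init_time_ms = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

        cpu_list_lock();
        gen_id = cpu_list_generation_id_get();   /* hypothetical helper */
        records = vcpu_dirty_stat_alloc(stat);
        vcpu_dirty_stat_collect(stat, records, true);
        cpu_list_unlock();

        duration = dirty_stat_wait(calc_time_ms, init_time_ms);

        global_dirty_log_sync(flag, one_shot);

        cpu_list_lock();
        if (gen_id != cpu_list_generation_id_get()) {
            /*
             * The cpu list changed during the wait: drop this round and redo
             * the whole measurement (a real version would also need to
             * restart dirty logging here if one_shot already stopped it).
             */
            cpu_list_unlock();
            g_free(records);
            g_free(stat->rates);
            goto retry;
        }
        vcpu_dirty_stat_collect(stat, records, false);
        cpu_list_unlock();

        for (i = 0; i < stat->nvcpu; i++) {
            dirtyrate = do_calculate_dirtyrate(records[i], duration);
            stat->rates[i].id = i;
            stat->rates[i].dirty_rate = dirtyrate;
        }

        g_free(records);
        return duration;
    }
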
The simpler way is still just to take the lock, imho.
The rest looks good, thanks.
> +
> + duration = dirty_stat_wait(calc_time_ms, init_time_ms);
> +
> + global_dirty_log_sync(flag, one_shot);
> +
> + cpu_list_lock();
> + vcpu_dirty_stat_collect(stat, records, false);
> + cpu_list_unlock();
> +
> + for (i = 0; i < stat->nvcpu; i++) {
> + dirtyrate = do_calculate_dirtyrate(records[i], duration);
> +
> + stat->rates[i].id = i;
> + stat->rates[i].dirty_rate = dirtyrate;
> +
> + trace_dirtyrate_do_calculate_vcpu(i, dirtyrate);
> + }
> +
> + g_free(records);
> +
> + return duration;
> +}
> +
> static bool is_sample_period_valid(int64_t sec)
> {
> if (sec < MIN_FETCH_DIRTYRATE_TIME_SEC ||
> @@ -396,44 +518,6 @@ static bool compare_page_hash_info(struct RamblockDirtyInfo *info,
> return true;
> }
>
> -static inline void record_dirtypages(DirtyPageRecord *dirty_pages,
> - CPUState *cpu, bool start)
> -{
> - if (start) {
> - dirty_pages[cpu->cpu_index].start_pages = cpu->dirty_pages;
> - } else {
> - dirty_pages[cpu->cpu_index].end_pages = cpu->dirty_pages;
> - }
> -}
> -
> -static void dirtyrate_global_dirty_log_start(void)
> -{
> - qemu_mutex_lock_iothread();
> - memory_global_dirty_log_start(GLOBAL_DIRTY_DIRTY_RATE);
> - qemu_mutex_unlock_iothread();
> -}
> -
> -static void dirtyrate_global_dirty_log_stop(void)
> -{
> - qemu_mutex_lock_iothread();
> - memory_global_dirty_log_sync();
> - memory_global_dirty_log_stop(GLOBAL_DIRTY_DIRTY_RATE);
> - qemu_mutex_unlock_iothread();
> -}
> -
> -static int64_t do_calculate_dirtyrate_vcpu(DirtyPageRecord dirty_pages)
> -{
> - uint64_t memory_size_MB;
> - int64_t time_s;
> - uint64_t increased_dirty_pages =
> - dirty_pages.end_pages - dirty_pages.start_pages;
> -
> - memory_size_MB = (increased_dirty_pages * TARGET_PAGE_SIZE) >> 20;
> - time_s = DirtyStat.calc_time;
> -
> - return memory_size_MB / time_s;
> -}
> -
> static inline void record_dirtypages_bitmap(DirtyPageRecord *dirty_pages,
> bool start)
> {
> @@ -444,11 +528,6 @@ static inline void record_dirtypages_bitmap(DirtyPageRecord *dirty_pages,
> }
> }
>
> -static void do_calculate_dirtyrate_bitmap(DirtyPageRecord dirty_pages)
> -{
> - DirtyStat.dirty_rate = do_calculate_dirtyrate_vcpu(dirty_pages);
> -}
> -
> static inline void dirtyrate_manual_reset_protect(void)
> {
> RAMBlock *block = NULL;
> @@ -492,71 +571,52 @@ static void calculate_dirtyrate_dirty_bitmap(struct DirtyRateConfig config)
> DirtyStat.start_time = start_time / 1000;
>
> msec = config.sample_period_seconds * 1000;
> - msec = set_sample_page_period(msec, start_time);
> + msec = dirty_stat_wait(msec, start_time);
> DirtyStat.calc_time = msec / 1000;
>
> /*
> - * dirtyrate_global_dirty_log_stop do two things.
> + * do two things.
> * 1. fetch dirty bitmap from kvm
> * 2. stop dirty tracking
> */
> - dirtyrate_global_dirty_log_stop();
> + global_dirty_log_sync(GLOBAL_DIRTY_DIRTY_RATE, true);
>
> record_dirtypages_bitmap(&dirty_pages, false);
>
> - do_calculate_dirtyrate_bitmap(dirty_pages);
> + DirtyStat.dirty_rate = do_calculate_dirtyrate(dirty_pages, msec);
> }
>
> static void calculate_dirtyrate_dirty_ring(struct DirtyRateConfig config)
> {
> - CPUState *cpu;
> - int64_t msec = 0;
> int64_t start_time;
> + int64_t duration;
> uint64_t dirtyrate = 0;
> uint64_t dirtyrate_sum = 0;
> - DirtyPageRecord *dirty_pages;
> - int nvcpu = 0;
> int i = 0;
>
> - CPU_FOREACH(cpu) {
> - nvcpu++;
> - }
> -
> - dirty_pages = malloc(sizeof(*dirty_pages) * nvcpu);
> -
> - DirtyStat.dirty_ring.nvcpu = nvcpu;
> - DirtyStat.dirty_ring.rates = malloc(sizeof(DirtyRateVcpu) * nvcpu);
> -
> - dirtyrate_global_dirty_log_start();
> -
> - CPU_FOREACH(cpu) {
> - record_dirtypages(dirty_pages, cpu, true);
> - }
> + /* start log sync */
> + global_dirty_log_change(GLOBAL_DIRTY_DIRTY_RATE, true);
>
> start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> DirtyStat.start_time = start_time / 1000;
>
> - msec = config.sample_period_seconds * 1000;
> - msec = set_sample_page_period(msec, start_time);
> - DirtyStat.calc_time = msec / 1000;
> + /* calculate vcpu dirtyrate */
> + duration = vcpu_calculate_dirtyrate(config.sample_period_seconds * 1000,
> + start_time,
> + &DirtyStat.dirty_ring,
> + GLOBAL_DIRTY_DIRTY_RATE,
> + true);
>
> - dirtyrate_global_dirty_log_stop();
> -
> - CPU_FOREACH(cpu) {
> - record_dirtypages(dirty_pages, cpu, false);
> - }
> + DirtyStat.calc_time = duration / 1000;
>
> + /* calculate vm dirtyrate */
> for (i = 0; i < DirtyStat.dirty_ring.nvcpu; i++) {
> - dirtyrate = do_calculate_dirtyrate_vcpu(dirty_pages[i]);
> - trace_dirtyrate_do_calculate_vcpu(i, dirtyrate);
> -
> - DirtyStat.dirty_ring.rates[i].id = i;
> + dirtyrate = DirtyStat.dirty_ring.rates[i].dirty_rate;
> DirtyStat.dirty_ring.rates[i].dirty_rate = dirtyrate;
> dirtyrate_sum += dirtyrate;
> }
>
> DirtyStat.dirty_rate = dirtyrate_sum;
> - free(dirty_pages);
> }
>
> static void calculate_dirtyrate_sample_vm(struct DirtyRateConfig config)
> @@ -574,7 +634,7 @@ static void calculate_dirtyrate_sample_vm(struct DirtyRateConfig config)
> rcu_read_unlock();
>
> msec = config.sample_period_seconds * 1000;
> - msec = set_sample_page_period(msec, initial_time);
> + msec = dirty_stat_wait(msec, initial_time);
> DirtyStat.start_time = initial_time / 1000;
> DirtyStat.calc_time = msec / 1000;
>
> diff --git a/migration/dirtyrate.h b/migration/dirtyrate.h
> index 69d4c5b..594a5c0 100644
> --- a/migration/dirtyrate.h
> +++ b/migration/dirtyrate.h
> @@ -13,6 +13,8 @@
> #ifndef QEMU_MIGRATION_DIRTYRATE_H
> #define QEMU_MIGRATION_DIRTYRATE_H
>
> +#include "sysemu/dirtyrate.h"
> +
> /*
> * Sample 512 pages per GB as default.
> */
> @@ -65,11 +67,6 @@ typedef struct SampleVMStat {
> uint64_t total_block_mem_MB; /* size of total sampled pages in MB */
> } SampleVMStat;
>
> -typedef struct VcpuStat {
> - int nvcpu; /* number of vcpu */
> - DirtyRateVcpu *rates; /* array of dirty rate for each vcpu */
> -} VcpuStat;
> -
> /*
> * Store calculation statistics for each measure.
> */
> --
> 1.8.3.1
>
--
Peter Xu
On 2022/1/17 10:19, Peter Xu wrote:
> On Wed, Jan 05, 2022 at 01:14:06AM +0800, huangy81@chinatelecom.cn wrote:
>> From: Hyman Huang(黄勇) <huangy81@chinatelecom.cn>
>>
>> +
>> +static void vcpu_dirty_stat_collect(VcpuStat *stat,
>> + DirtyPageRecord *records,
>> + bool start)
>> +{
>> + CPUState *cpu;
>> +
>> + CPU_FOREACH(cpu) {
>> + if (!start && cpu->cpu_index >= stat->nvcpu) {
>> + /*
>> + * Never go there unless cpu is hot-plugged,
>> + * just ignore in this case.
>> + */
>> + continue;
>> + }
>
> As commented before, I think the only way to do it right is does not allow cpu
> plug/unplug during measurement..
>
> Say, even if index didn't get out of range, an unplug even should generate very
> stange output of the unplugged cpu. Please see more below.
>
>> + record_dirtypages(records, cpu, start);
>> + }
>> +}
>> +
>> +int64_t vcpu_calculate_dirtyrate(int64_t calc_time_ms,
>> + int64_t init_time_ms,
>> + VcpuStat *stat,
>> + unsigned int flag,
>> + bool one_shot)
>> +{
>> + DirtyPageRecord *records;
>> + int64_t duration;
>> + int64_t dirtyrate;
>> + int i = 0;
>> +
>> + cpu_list_lock();
>> + records = vcpu_dirty_stat_alloc(stat);
>> + vcpu_dirty_stat_collect(stat, records, true);
>> + cpu_list_unlock();
>
> Continue with above - then I'm wondering whether we should just keep taking the
> lock until vcpu_dirty_stat_collect().
>
> Yes we could be taking the lock for a long time because of the sleep, but the
> main thread plug thread will just wait for it to complete and it is at least
> not a e.g. deadlock.
>
> The other solution is we do cpu_list_unlock() like this, but introduce another
> cpu_list_generation_id and boost it after any plug/unplug of cpu, aka, when cpu
> list changes.
>
> Then we record cpu generation ID at the entry of this function and retry the
> whole measurement if at some point we found generation ID changed (we need to
> fetch the gen ID after having the lock, of course). That could avoid us taking
> the cpu list lock during dirty_stat_wait(), but it'll start to complicate cpu
> list locking rules.
>
> The simpler way is still just to take the lock, imho.
>
Hi Peter, I'm working on this per your suggestion, keeping the cpu list lock
held across the dirty page rate calculation. I found a deadlock when testing
the hotplug scenario; the logic is as follows:
calc thread                          qemu main thread
1. take qemu_cpu_list_lock
                                     1. take the BQL
2. collect dirty page and wait       2. cpu hotplug
                                     3. take qemu_cpu_list_lock
3. take the BQL
4. sync dirty log
5. release the BQL
I just recalled that this is one of the reasons why I handled the plug/unplug
scenario explicitly (another is that a cpu plug may have to wait quite a long
time while dirtylimit is in service).

It seems we have two strategies: one is to keep this logic untouched in v12 and
add the "cpu_list_generation_id" implementation to the TODO list (once this
patchset is merged, I'll try that out); the other is to introduce
"cpu_list_generation_id" right now.

Which strategy do you prefer?

Uh... I think the "unmatched_cnt" question is a similar case, because once we
remove the "unmatched count" logic the throttle algorithm is more likely to
oscillate, so I would prefer to add "unmatched_cnt" to the TODO list as above
as well.
> The rest looks good, thanks.
>
>> +
>> + duration = dirty_stat_wait(calc_time_ms, init_time_ms);
>> +
>> + global_dirty_log_sync(flag, one_shot);
>> +
>> + cpu_list_lock();
>> + vcpu_dirty_stat_collect(stat, records, false);
>> + cpu_list_unlock();
>> +
>> + for (i = 0; i < stat->nvcpu; i++) {
>> + dirtyrate = do_calculate_dirtyrate(records[i], duration);
>> +
>> + stat->rates[i].id = i;
>> + stat->rates[i].dirty_rate = dirtyrate;
>> +
>> + trace_dirtyrate_do_calculate_vcpu(i, dirtyrate);
>> + }
>> +
>> + g_free(records);
>> +
>> + return duration;
>> +}
>> +
>> static bool is_sample_period_valid(int64_t sec)
>> {
>> if (sec < MIN_FETCH_DIRTYRATE_TIME_SEC ||
>> @@ -396,44 +518,6 @@ static bool compare_page_hash_info(struct RamblockDirtyInfo *info,
>> return true;
>> }
>>
>> -static inline void record_dirtypages(DirtyPageRecord *dirty_pages,
>> - CPUState *cpu, bool start)
>> -{
>> - if (start) {
>> - dirty_pages[cpu->cpu_index].start_pages = cpu->dirty_pages;
>> - } else {
>> - dirty_pages[cpu->cpu_index].end_pages = cpu->dirty_pages;
>> - }
>> -}
>> -
>> -static void dirtyrate_global_dirty_log_start(void)
>> -{
>> - qemu_mutex_lock_iothread();
>> - memory_global_dirty_log_start(GLOBAL_DIRTY_DIRTY_RATE);
>> - qemu_mutex_unlock_iothread();
>> -}
>> -
>> -static void dirtyrate_global_dirty_log_stop(void)
>> -{
>> - qemu_mutex_lock_iothread();
>> - memory_global_dirty_log_sync();
>> - memory_global_dirty_log_stop(GLOBAL_DIRTY_DIRTY_RATE);
>> - qemu_mutex_unlock_iothread();
>> -}
>> -
>> -static int64_t do_calculate_dirtyrate_vcpu(DirtyPageRecord dirty_pages)
>> -{
>> - uint64_t memory_size_MB;
>> - int64_t time_s;
>> - uint64_t increased_dirty_pages =
>> - dirty_pages.end_pages - dirty_pages.start_pages;
>> -
>> - memory_size_MB = (increased_dirty_pages * TARGET_PAGE_SIZE) >> 20;
>> - time_s = DirtyStat.calc_time;
>> -
>> - return memory_size_MB / time_s;
>> -}
>> -
>> static inline void record_dirtypages_bitmap(DirtyPageRecord *dirty_pages,
>> bool start)
>> {
>> @@ -444,11 +528,6 @@ static inline void record_dirtypages_bitmap(DirtyPageRecord *dirty_pages,
>> }
>> }
>>
>> -static void do_calculate_dirtyrate_bitmap(DirtyPageRecord dirty_pages)
>> -{
>> - DirtyStat.dirty_rate = do_calculate_dirtyrate_vcpu(dirty_pages);
>> -}
>> -
>> static inline void dirtyrate_manual_reset_protect(void)
>> {
>> RAMBlock *block = NULL;
>> @@ -492,71 +571,52 @@ static void calculate_dirtyrate_dirty_bitmap(struct DirtyRateConfig config)
>> DirtyStat.start_time = start_time / 1000;
>>
>> msec = config.sample_period_seconds * 1000;
>> - msec = set_sample_page_period(msec, start_time);
>> + msec = dirty_stat_wait(msec, start_time);
>> DirtyStat.calc_time = msec / 1000;
>>
>> /*
>> - * dirtyrate_global_dirty_log_stop do two things.
>> + * do two things.
>> * 1. fetch dirty bitmap from kvm
>> * 2. stop dirty tracking
>> */
>> - dirtyrate_global_dirty_log_stop();
>> + global_dirty_log_sync(GLOBAL_DIRTY_DIRTY_RATE, true);
>>
>> record_dirtypages_bitmap(&dirty_pages, false);
>>
>> - do_calculate_dirtyrate_bitmap(dirty_pages);
>> + DirtyStat.dirty_rate = do_calculate_dirtyrate(dirty_pages, msec);
>> }
>>
--
Best regards,
Hyman Huang(黄勇)
On Sat, Jan 22, 2022 at 11:22:37AM +0800, Hyman Huang wrote:
>
>
> On 2022/1/17 10:19, Peter Xu wrote:
> > On Wed, Jan 05, 2022 at 01:14:06AM +0800, huangy81@chinatelecom.cn wrote:
> > > From: Hyman Huang(黄勇) <huangy81@chinatelecom.cn>
> > >
> > > +
> > > +static void vcpu_dirty_stat_collect(VcpuStat *stat,
> > > + DirtyPageRecord *records,
> > > + bool start)
> > > +{
> > > + CPUState *cpu;
> > > +
> > > + CPU_FOREACH(cpu) {
> > > + if (!start && cpu->cpu_index >= stat->nvcpu) {
> > > + /*
> > > + * Never go there unless cpu is hot-plugged,
> > > + * just ignore in this case.
> > > + */
> > > + continue;
> > > + }
> >
> > As commented before, I think the only way to do it right is does not allow cpu
> > plug/unplug during measurement..
> >
> > Say, even if index didn't get out of range, an unplug even should generate very
> > stange output of the unplugged cpu. Please see more below.
> >
> > > + record_dirtypages(records, cpu, start);
> > > + }
> > > +}
> > > +
> > > +int64_t vcpu_calculate_dirtyrate(int64_t calc_time_ms,
> > > + int64_t init_time_ms,
> > > + VcpuStat *stat,
> > > + unsigned int flag,
> > > + bool one_shot)
> > > +{
> > > + DirtyPageRecord *records;
> > > + int64_t duration;
> > > + int64_t dirtyrate;
> > > + int i = 0;
> > > +
> > > + cpu_list_lock();
> > > + records = vcpu_dirty_stat_alloc(stat);
> > > + vcpu_dirty_stat_collect(stat, records, true);
> > > + cpu_list_unlock();
> >
> > Continue with above - then I'm wondering whether we should just keep taking the
> > lock until vcpu_dirty_stat_collect().
> >
> > Yes we could be taking the lock for a long time because of the sleep, but the
> > main thread plug thread will just wait for it to complete and it is at least
> > not a e.g. deadlock.
> >
> > The other solution is we do cpu_list_unlock() like this, but introduce another
> > cpu_list_generation_id and boost it after any plug/unplug of cpu, aka, when cpu
> > list changes.
> >
> > Then we record cpu generation ID at the entry of this function and retry the
> > whole measurement if at some point we found generation ID changed (we need to
> > fetch the gen ID after having the lock, of course). That could avoid us taking
> > the cpu list lock during dirty_stat_wait(), but it'll start to complicate cpu
> > list locking rules.
> >
> > The simpler way is still just to take the lock, imho.
> >
> Hi, Peter, i'm working on this as you suggetion, and keep taking the
> cpu_list_lock during dirty page rate calculation. I found the deadlock when
> testing hotplug scenario, the logic is as the following:
>
> calc thread qemu main thread
> 1. take qemu_cpu_list_lock
> 1. take the BQL
> 2. collect dirty page and wait 2. cpu hotplug
> 3. take qemu_cpu_list_lock
> 3. take the BQL
>
> 4. sync dirty log
>
> 5. release the BQL
>
> I just recall that is one of the reasons why i handle the plug/unplug
> scenario(another is cpu plug may wait a little bit long time when dirtylimit
> in service).
Ah, I should have noticed the BQL dependency on the cpu list lock before..

I think having the cpu plug wait for one second is fine, because the mgmt app
should be aware of both operations, so this shouldn't even happen in practice
(it's not good timing to plug during pre-migration). However, the BQL is
definitely another story.. on that I agree.
>
> It seems that we have two strategies, one is just keep this logic untouched
> in v12 and add "cpu_list_generation_id" implementaion in TODO list(once this
> patchset been merged, i'll try that out), another is introducing the
> "cpu_list_generation_id" right now.
>
> What strategy do you prefer to?
I prefer having the gen_id patch. The thing is, it should be less than 10 lines
and the logic should be fairly straightforward, whereas without it, using this
new feature always seems risky.

I hope I didn't overlook any existing mechanism to block cpu plug/unplug for
some period, though; if there is one, we should use it.
>
> Uh... I think the "unmatched_cnt" also kind of like this too, becauce once
> we remove the "unmatched count" logic, the throttle algo is more likely to
> oscillate and i prefer to add the "unmatched_cnt" in TODO list as above.
Could we tune the differential factor to make it less likely to oscillate?

I still can't say I like the "unmatched cnt" idea a lot.. From a PID point of
view (proportional, integral, derivative) you've already got the proportional
and derivative parts, and IMHO the "unmatched cnt" solution was trying to mimic
an "integral" term. Instead of doing a mean value calculation (as most integral
terms do), the "unmatched cnt" solution literally made it an array of 2 and
dropped the 1st element.. Hence a decision was made only from the 2nd data
point you collected.

From that POV I think it's cleaner if you add a real (but simple) integral
algorithm to it. It could be, e.g., an array of 3; then when you do the math
you use the average of the three dirty rates. Would that work (and also look a
bit cleaner)?
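
A minimal sketch of that idea, with all names made up for illustration (nothing
of this exists in the dirtylimit code):

    #define DIRTYRATE_HISTORY_LEN 3

    typedef struct DirtyRateHistory {
        uint64_t samples[DIRTYRATE_HISTORY_LEN];
        unsigned int next;    /* slot to overwrite next */
        unsigned int count;   /* number of valid samples so far */
    } DirtyRateHistory;

    /* Feed one new dirty rate sample, return the average of the recent ones. */
    static uint64_t dirtyrate_history_feed(DirtyRateHistory *h, uint64_t sample)
    {
        uint64_t sum = 0;
        unsigned int i;

        h->samples[h->next] = sample;
        h->next = (h->next + 1) % DIRTYRATE_HISTORY_LEN;
        if (h->count < DIRTYRATE_HISTORY_LEN) {
            h->count++;
        }

        for (i = 0; i < h->count; i++) {
            sum += h->samples[i];
        }

        /* Averaging smooths single-sample spikes before the throttle decision. */
        return sum / h->count;
    }
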
Thanks,
--
Peter Xu
On 2022/1/24 11:08, Peter Xu wrote:
> On Sat, Jan 22, 2022 at 11:22:37AM +0800, Hyman Huang wrote:
>>
>>
>> On 2022/1/17 10:19, Peter Xu wrote:
>>> On Wed, Jan 05, 2022 at 01:14:06AM +0800, huangy81@chinatelecom.cn wrote:
>>>> From: Hyman Huang(黄勇) <huangy81@chinatelecom.cn>
>>>>
>>>> +
>>>> +static void vcpu_dirty_stat_collect(VcpuStat *stat,
>>>> + DirtyPageRecord *records,
>>>> + bool start)
>>>> +{
>>>> + CPUState *cpu;
>>>> +
>>>> + CPU_FOREACH(cpu) {
>>>> + if (!start && cpu->cpu_index >= stat->nvcpu) {
>>>> + /*
>>>> + * Never go there unless cpu is hot-plugged,
>>>> + * just ignore in this case.
>>>> + */
>>>> + continue;
>>>> + }
>>>
>>> As commented before, I think the only way to do it right is does not allow cpu
>>> plug/unplug during measurement..
>>>
>>> Say, even if index didn't get out of range, an unplug even should generate very
>>> stange output of the unplugged cpu. Please see more below.
>>>
>>>> + record_dirtypages(records, cpu, start);
>>>> + }
>>>> +}
>>>> +
>>>> +int64_t vcpu_calculate_dirtyrate(int64_t calc_time_ms,
>>>> + int64_t init_time_ms,
>>>> + VcpuStat *stat,
>>>> + unsigned int flag,
>>>> + bool one_shot)
>>>> +{
>>>> + DirtyPageRecord *records;
>>>> + int64_t duration;
>>>> + int64_t dirtyrate;
>>>> + int i = 0;
>>>> +
>>>> + cpu_list_lock();
>>>> + records = vcpu_dirty_stat_alloc(stat);
>>>> + vcpu_dirty_stat_collect(stat, records, true);
>>>> + cpu_list_unlock();
>>>
>>> Continue with above - then I'm wondering whether we should just keep taking the
>>> lock until vcpu_dirty_stat_collect().
>>>
>>> Yes we could be taking the lock for a long time because of the sleep, but the
>>> main thread plug thread will just wait for it to complete and it is at least
>>> not a e.g. deadlock.
>>>
>>> The other solution is we do cpu_list_unlock() like this, but introduce another
>>> cpu_list_generation_id and boost it after any plug/unplug of cpu, aka, when cpu
>>> list changes.
>>>
>>> Then we record cpu generation ID at the entry of this function and retry the
>>> whole measurement if at some point we found generation ID changed (we need to
>>> fetch the gen ID after having the lock, of course). That could avoid us taking
>>> the cpu list lock during dirty_stat_wait(), but it'll start to complicate cpu
>>> list locking rules.
>>>
>>> The simpler way is still just to take the lock, imho.
>>>
>> Hi, Peter, i'm working on this as you suggetion, and keep taking the
>> cpu_list_lock during dirty page rate calculation. I found the deadlock when
>> testing hotplug scenario, the logic is as the following:
>>
>> calc thread qemu main thread
>> 1. take qemu_cpu_list_lock
>> 1. take the BQL
>> 2. collect dirty page and wait 2. cpu hotplug
>> 3. take qemu_cpu_list_lock
>> 3. take the BQL
>>
>> 4. sync dirty log
>>
>> 5. release the BQL
>>
>> I just recall that is one of the reasons why i handle the plug/unplug
>> scenario(another is cpu plug may wait a little bit long time when dirtylimit
>> in service).
>
> Ah I should have noticed the bql dependency with cpu list lock before..
>
> I think having the cpu plug waiting for one sec is fine, because the mgmt app
> should be aware of both so it shouldn't even happen in practise (not good
> timing to plug during pre-migration). However bql is definitely another
> story.. which I agree.
>
>>
>> It seems that we have two strategies, one is just keep this logic untouched
>> in v12 and add "cpu_list_generation_id" implementaion in TODO list(once this
>> patchset been merged, i'll try that out), another is introducing the
>> "cpu_list_generation_id" right now.
>>
>> What strategy do you prefer to?
>
> I prefer having the gen_id patch. The thing is it should be less than 10 lines
> and the logic should be fairly straightforward. While if without it, it seems
> always on risk to use this new feature.
>
> I hope I didn't overlook any existing mechanism to block cpu plug/unplug for
> some period, though, or we should use it.
>
>>
>> Uh... I think the "unmatched_cnt" also kind of like this too, becauce once
>> we remove the "unmatched count" logic, the throttle algo is more likely to
>> oscillate and i prefer to add the "unmatched_cnt" in TODO list as above.
>
> Could we tune the differential factor to make it less possible to oscillate?
Uh... From certain angles, yes. When the current dirty page rate is close to
the quota while dirtylimit is in service, the throttle reaches a balance. Once
the current dirty page rate shows a slight fluctuation (not much oscillation),
the sleep time is adjusted by an amount that can effectively be ignored.
> I still can't say I like "unmatched cnt" idea a lot.. From a PID pov (partial,
> integral, differential) you've already got partial + differential, and IMHO
> that "unmatched cnt" solution was trying to mimic an "integral" delta. Instead
> of doing an mean value calculation (as in most integral system does) the
> "unmatched cnt" solution literally made it an array of 2 and it dropped the 1st
> element.. Hence a decision was made only from the 2nd data you collected.
>
> From that POV I think it's cleaner you add a real (but simple) integral algo
> into it? It can be e.g. an array of 3, then when you do the math you use the
> average of the three dirty rates. Would that work (and also look a bit
> cleaner)?
Yes, IMHO this is a more complete algorithm and we can try it out. So let's see
the v12 test results and then decide whether the above work should be added to
the TODO list. :)
>
> Thanks,
>
--
Best regards,
Hyman Huang(黄勇)