qapi/migration.json | 15 ++++++-- migration/dirtyrate.h | 12 ++++--- migration/dirtyrate.c | 81 +++++++++++++++++++++++++------------------ 3 files changed, 68 insertions(+), 40 deletions(-)
Introduces alternative argument calc-time-ms, which is
the same as calc-time but accepts a millisecond value.
Millisecond precision makes it possible to predict whether
migration will succeed or not. To do this, calculate the dirty
rate with calc-time-ms set to the max allowed downtime, convert
the measured rate into the volume of dirtied memory, and divide
it by the network throughput. If the resulting value is lower
than the max allowed downtime, then migration will converge.
Measurement results for single thread randomly writing to
a 24GiB region:
+--------------+--------------------+
| calc-time-ms | dirty-rate (MiB/s) |
+--------------+--------------------+
| 100 | 1880 |
| 200 | 1340 |
| 300 | 1120 |
| 400 | 1030 |
| 500 | 868 |
| 750 | 720 |
| 1000 | 636 |
| 1500 | 498 |
| 2000 | 423 |
+--------------+--------------------+
Signed-off-by: Andrei Gudkov <gudkov.andrei@huawei.com>
---
qapi/migration.json | 15 ++++++--
migration/dirtyrate.h | 12 ++++---
migration/dirtyrate.c | 81 +++++++++++++++++++++++++------------------
3 files changed, 68 insertions(+), 40 deletions(-)
diff --git a/qapi/migration.json b/qapi/migration.json
index 5bb5ab82a0..dd1afe1982 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -1778,7 +1778,12 @@
#
# @start-time: start time in units of second for calculation
#
-# @calc-time: time in units of second for sample dirty pages
+# @calc-time: time period for which dirty page rate was measured
+# (rounded down to seconds).
+#
+# @calc-time-ms: actual time period for which dirty page rate was
+# measured (in milliseconds). Value may be larger than requested
+# time period due to measurement overhead.
#
# @sample-pages: page count per GB for sample dirty pages the default
# value is 512 (since 6.1)
@@ -1796,6 +1801,7 @@
'status': 'DirtyRateStatus',
'start-time': 'int64',
'calc-time': 'int64',
+ 'calc-time-ms': 'int64',
'sample-pages': 'uint64',
'mode': 'DirtyRateMeasureMode',
'*vcpu-dirty-rate': [ 'DirtyRateVcpu' ] } }
@@ -1807,6 +1813,10 @@
#
# @calc-time: time in units of second for sample dirty pages
#
+# @calc-time-ms: the same as @calc-time but in milliseconds. These
+# two arguments are mutually exclusive. Exactly one of them must
+# be specified. (Since 8.1)
+#
# @sample-pages: page count per GB for sample dirty pages the default
# value is 512 (since 6.1)
#
@@ -1821,7 +1831,8 @@
# 'sample-pages': 512} }
# <- { "return": {} }
##
-{ 'command': 'calc-dirty-rate', 'data': {'calc-time': 'int64',
+{ 'command': 'calc-dirty-rate', 'data': {'*calc-time': 'int64',
+ '*calc-time-ms': 'int64',
'*sample-pages': 'int',
'*mode': 'DirtyRateMeasureMode'} }
diff --git a/migration/dirtyrate.h b/migration/dirtyrate.h
index 594a5c0bb6..869c060941 100644
--- a/migration/dirtyrate.h
+++ b/migration/dirtyrate.h
@@ -31,10 +31,12 @@
#define MIN_RAMBLOCK_SIZE 128
/*
- * Take 1s as minimum time for calculation duration
+ * Allowed range for dirty page rate calculation (in milliseconds).
+ * Lower limit relates to the smallest realistic downtime it
+ * makes sense to impose on migration.
*/
-#define MIN_FETCH_DIRTYRATE_TIME_SEC 1
-#define MAX_FETCH_DIRTYRATE_TIME_SEC 60
+#define MIN_CALC_TIME_MS 50
+#define MAX_CALC_TIME_MS 60000
/*
* Take 1/16 pages in 1G as the maxmum sample page count
@@ -44,7 +46,7 @@
struct DirtyRateConfig {
uint64_t sample_pages_per_gigabytes; /* sample pages per GB */
- int64_t sample_period_seconds; /* time duration between two sampling */
+ int64_t calc_time_ms; /* desired calculation time (in milliseconds) */
DirtyRateMeasureMode mode; /* mode of dirtyrate measurement */
};
@@ -73,7 +75,7 @@ typedef struct SampleVMStat {
struct DirtyRateStat {
int64_t dirty_rate; /* dirty rate in MB/s */
int64_t start_time; /* calculation start time in units of second */
- int64_t calc_time; /* time duration of two sampling in units of second */
+ int64_t calc_time_ms; /* actual calculation time (in milliseconds) */
uint64_t sample_pages; /* sample pages per GB */
union {
SampleVMStat page_sampling;
diff --git a/migration/dirtyrate.c b/migration/dirtyrate.c
index 84f1b0fb20..90fb336329 100644
--- a/migration/dirtyrate.c
+++ b/migration/dirtyrate.c
@@ -57,6 +57,8 @@ static int64_t dirty_stat_wait(int64_t msec, int64_t initial_time)
msec = current_time - initial_time;
} else {
g_usleep((msec + initial_time - current_time) * 1000);
+ /* g_usleep may overshoot */
+ msec = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - initial_time;
}
return msec;
@@ -77,9 +79,12 @@ static int64_t do_calculate_dirtyrate(DirtyPageRecord dirty_pages,
{
uint64_t increased_dirty_pages =
dirty_pages.end_pages - dirty_pages.start_pages;
- uint64_t memory_size_MiB = qemu_target_pages_to_MiB(increased_dirty_pages);
-
- return memory_size_MiB * 1000 / calc_time_ms;
+ /*
+ * multiply by 1000ms/s _before_ converting down to megabytes
+ * to avoid losing precision
+ */
+ return qemu_target_pages_to_MiB(increased_dirty_pages * 1000) /
+ calc_time_ms;
}
void global_dirty_log_change(unsigned int flag, bool start)
@@ -183,10 +188,9 @@ retry:
return duration;
}
-static bool is_sample_period_valid(int64_t sec)
+static bool is_calc_time_valid(int64_t msec)
{
- if (sec < MIN_FETCH_DIRTYRATE_TIME_SEC ||
- sec > MAX_FETCH_DIRTYRATE_TIME_SEC) {
+ if ((msec < MIN_CALC_TIME_MS) || (msec > MAX_CALC_TIME_MS)) {
return false;
}
@@ -219,7 +223,8 @@ static struct DirtyRateInfo *query_dirty_rate_info(void)
info->status = CalculatingState;
info->start_time = DirtyStat.start_time;
- info->calc_time = DirtyStat.calc_time;
+ info->calc_time_ms = DirtyStat.calc_time_ms;
+ info->calc_time = DirtyStat.calc_time_ms / 1000;
info->sample_pages = DirtyStat.sample_pages;
info->mode = dirtyrate_mode;
@@ -258,7 +263,7 @@ static void init_dirtyrate_stat(int64_t start_time,
{
DirtyStat.dirty_rate = -1;
DirtyStat.start_time = start_time;
- DirtyStat.calc_time = config.sample_period_seconds;
+ DirtyStat.calc_time_ms = config.calc_time_ms;
DirtyStat.sample_pages = config.sample_pages_per_gigabytes;
switch (config.mode) {
@@ -568,7 +573,6 @@ static inline void dirtyrate_manual_reset_protect(void)
static void calculate_dirtyrate_dirty_bitmap(struct DirtyRateConfig config)
{
- int64_t msec = 0;
int64_t start_time;
DirtyPageRecord dirty_pages;
@@ -596,9 +600,7 @@ static void calculate_dirtyrate_dirty_bitmap(struct DirtyRateConfig config)
start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
DirtyStat.start_time = start_time / 1000;
- msec = config.sample_period_seconds * 1000;
- msec = dirty_stat_wait(msec, start_time);
- DirtyStat.calc_time = msec / 1000;
+ DirtyStat.calc_time_ms = dirty_stat_wait(config.calc_time_ms, start_time);
/*
* do two things.
@@ -609,12 +611,12 @@ static void calculate_dirtyrate_dirty_bitmap(struct DirtyRateConfig config)
record_dirtypages_bitmap(&dirty_pages, false);
- DirtyStat.dirty_rate = do_calculate_dirtyrate(dirty_pages, msec);
+ DirtyStat.dirty_rate = do_calculate_dirtyrate(dirty_pages,
+ DirtyStat.calc_time_ms);
}
static void calculate_dirtyrate_dirty_ring(struct DirtyRateConfig config)
{
- int64_t duration;
uint64_t dirtyrate = 0;
uint64_t dirtyrate_sum = 0;
int i = 0;
@@ -625,12 +627,10 @@ static void calculate_dirtyrate_dirty_ring(struct DirtyRateConfig config)
DirtyStat.start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) / 1000;
/* calculate vcpu dirtyrate */
- duration = vcpu_calculate_dirtyrate(config.sample_period_seconds * 1000,
- &DirtyStat.dirty_ring,
- GLOBAL_DIRTY_DIRTY_RATE,
- true);
-
- DirtyStat.calc_time = duration / 1000;
+ DirtyStat.calc_time_ms = vcpu_calculate_dirtyrate(config.calc_time_ms,
+ &DirtyStat.dirty_ring,
+ GLOBAL_DIRTY_DIRTY_RATE,
+ true);
/* calculate vm dirtyrate */
for (i = 0; i < DirtyStat.dirty_ring.nvcpu; i++) {
@@ -646,7 +646,6 @@ static void calculate_dirtyrate_sample_vm(struct DirtyRateConfig config)
{
struct RamblockDirtyInfo *block_dinfo = NULL;
int block_count = 0;
- int64_t msec = 0;
int64_t initial_time;
rcu_read_lock();
@@ -656,17 +655,16 @@ static void calculate_dirtyrate_sample_vm(struct DirtyRateConfig config)
}
rcu_read_unlock();
- msec = config.sample_period_seconds * 1000;
- msec = dirty_stat_wait(msec, initial_time);
+ DirtyStat.calc_time_ms = dirty_stat_wait(config.calc_time_ms,
+ initial_time);
DirtyStat.start_time = initial_time / 1000;
- DirtyStat.calc_time = msec / 1000;
rcu_read_lock();
if (!compare_page_hash_info(block_dinfo, block_count)) {
goto out;
}
- update_dirtyrate(msec);
+ update_dirtyrate(DirtyStat.calc_time_ms);
out:
rcu_read_unlock();
@@ -711,7 +709,10 @@ void *get_dirtyrate_thread(void *arg)
return NULL;
}
-void qmp_calc_dirty_rate(int64_t calc_time,
+void qmp_calc_dirty_rate(bool has_calc_time,
+ int64_t calc_time,
+ bool has_calc_time_ms,
+ int64_t calc_time_ms,
bool has_sample_pages,
int64_t sample_pages,
bool has_mode,
@@ -731,10 +732,21 @@ void qmp_calc_dirty_rate(int64_t calc_time,
return;
}
- if (!is_sample_period_valid(calc_time)) {
- error_setg(errp, "calc-time is out of range[%d, %d].",
- MIN_FETCH_DIRTYRATE_TIME_SEC,
- MAX_FETCH_DIRTYRATE_TIME_SEC);
+ if ((int)has_calc_time + (int)has_calc_time_ms != 1) {
+ error_setg(errp, "Exactly one of calc-time and calc-time-ms must"
+ " be specified");
+ return;
+ }
+ if (has_calc_time) {
+ /*
+ * The worst thing that can happen due to overflow is that
+ * invalid value will become valid.
+ */
+ calc_time_ms = calc_time * 1000;
+ }
+ if (!is_calc_time_valid(calc_time_ms)) {
+ error_setg(errp, "Calculation time is out of range[%dms, %dms].",
+ MIN_CALC_TIME_MS, MAX_CALC_TIME_MS);
return;
}
@@ -781,7 +793,7 @@ void qmp_calc_dirty_rate(int64_t calc_time,
return;
}
- config.sample_period_seconds = calc_time;
+ config.calc_time_ms = calc_time_ms;
config.sample_pages_per_gigabytes = sample_pages;
config.mode = mode;
@@ -867,8 +879,11 @@ void hmp_calc_dirty_rate(Monitor *mon, const QDict *qdict)
mode = DIRTY_RATE_MEASURE_MODE_DIRTY_RING;
}
- qmp_calc_dirty_rate(sec, has_sample_pages, sample_pages, true,
- mode, &err);
+ qmp_calc_dirty_rate(true, sec, /* calc_time */
+ false, 0, /* calc_time_ms */
+ has_sample_pages, sample_pages,
+ true, mode,
+ &err);
if (err) {
hmp_handle_error(mon, err);
return;
--
2.30.2
On Thu, Jun 29, 2023 at 11:59:03AM +0300, Andrei Gudkov wrote: > Introduces alternative argument calc-time-ms, which is the > the same as calc-time but accepts millisecond value. > Millisecond precision allows to make predictions whether > migration will succeed or not. To do this, calculate dirty > rate with calc-time-ms set to max allowed downtime, convert > measured rate into volume of dirtied memory, and divide by > network throughput. If the value is lower than max allowed > downtime, then migration will converge. > > Measurement results for single thread randomly writing to > a 24GiB region: > +--------------+--------------------+ > | calc-time-ms | dirty-rate (MiB/s) | > +--------------+--------------------+ > | 100 | 1880 | > | 200 | 1340 | > | 300 | 1120 | > | 400 | 1030 | > | 500 | 868 | > | 750 | 720 | > | 1000 | 636 | > | 1500 | 498 | > | 2000 | 423 | > +--------------+--------------------+ > > Signed-off-by: Andrei Gudkov <gudkov.andrei@huawei.com> Andrei, do you plan to enhance the commit message and data in a repost? I assume you may want to have your data points updated after the discussion, and it won't need to be in a rush as it will only land 8.2. The patch itself looks fine to me: Acked-by: Peter Xu <peterx@redhat.com> Thanks, -- Peter Xu
On Thu, Jun 29, 2023 at 11:59:03AM +0300, Andrei Gudkov wrote: > Introduces alternative argument calc-time-ms, which is the > the same as calc-time but accepts millisecond value. > Millisecond precision allows to make predictions whether > migration will succeed or not. To do this, calculate dirty > rate with calc-time-ms set to max allowed downtime, convert > measured rate into volume of dirtied memory, and divide by > network throughput. If the value is lower than max allowed > downtime, then migration will converge. > > Measurement results for single thread randomly writing to > a 24GiB region: > +--------------+--------------------+ > | calc-time-ms | dirty-rate (MiB/s) | > +--------------+--------------------+ > | 100 | 1880 | > | 200 | 1340 | > | 300 | 1120 | > | 400 | 1030 | > | 500 | 868 | > | 750 | 720 | > | 1000 | 636 | > | 1500 | 498 | > | 2000 | 423 | > +--------------+--------------------+ Do you mean the dirty workload is constant? Why it differs so much with different calc-time-ms? I also doubt usefulness on huge vms the ms accuracy won't matter much because enable/disable dirty logging overhead can already be ms level for those. > > Signed-off-by: Andrei Gudkov <gudkov.andrei@huawei.com> > --- > qapi/migration.json | 15 ++++++-- > migration/dirtyrate.h | 12 ++++--- > migration/dirtyrate.c | 81 +++++++++++++++++++++++++------------------ > 3 files changed, 68 insertions(+), 40 deletions(-) > > diff --git a/qapi/migration.json b/qapi/migration.json > index 5bb5ab82a0..dd1afe1982 100644 > --- a/qapi/migration.json > +++ b/qapi/migration.json > @@ -1778,7 +1778,12 @@ > # > # @start-time: start time in units of second for calculation > # > -# @calc-time: time in units of second for sample dirty pages > +# @calc-time: time period for which dirty page rate was measured > +# (rounded down to seconds). > +# > +# @calc-time-ms: actual time period for which dirty page rate was > +# measured (in milliseconds). 
Value may be larger than requested > +# time period due to measurement overhead. > # > # @sample-pages: page count per GB for sample dirty pages the default > # value is 512 (since 6.1) > @@ -1796,6 +1801,7 @@ > 'status': 'DirtyRateStatus', > 'start-time': 'int64', > 'calc-time': 'int64', > + 'calc-time-ms': 'int64', > 'sample-pages': 'uint64', > 'mode': 'DirtyRateMeasureMode', > '*vcpu-dirty-rate': [ 'DirtyRateVcpu' ] } } > @@ -1807,6 +1813,10 @@ > # > # @calc-time: time in units of second for sample dirty pages > # > +# @calc-time-ms: the same as @calc-time but in milliseconds. These > +# two arguments are mutually exclusive. Exactly one of them must > +# be specified. (Since 8.1) > +# > # @sample-pages: page count per GB for sample dirty pages the default > # value is 512 (since 6.1) > # > @@ -1821,7 +1831,8 @@ > # 'sample-pages': 512} } > # <- { "return": {} } > ## > -{ 'command': 'calc-dirty-rate', 'data': {'calc-time': 'int64', > +{ 'command': 'calc-dirty-rate', 'data': {'*calc-time': 'int64', > + '*calc-time-ms': 'int64', > '*sample-pages': 'int', > '*mode': 'DirtyRateMeasureMode'} } > > diff --git a/migration/dirtyrate.h b/migration/dirtyrate.h > index 594a5c0bb6..869c060941 100644 > --- a/migration/dirtyrate.h > +++ b/migration/dirtyrate.h > @@ -31,10 +31,12 @@ > #define MIN_RAMBLOCK_SIZE 128 > > /* > - * Take 1s as minimum time for calculation duration > + * Allowed range for dirty page rate calculation (in milliseconds). > + * Lower limit relates to the smallest realistic downtime it > + * makes sense to impose on migration. 
> */ > -#define MIN_FETCH_DIRTYRATE_TIME_SEC 1 > -#define MAX_FETCH_DIRTYRATE_TIME_SEC 60 > +#define MIN_CALC_TIME_MS 50 > +#define MAX_CALC_TIME_MS 60000 > > /* > * Take 1/16 pages in 1G as the maxmum sample page count > @@ -44,7 +46,7 @@ > > struct DirtyRateConfig { > uint64_t sample_pages_per_gigabytes; /* sample pages per GB */ > - int64_t sample_period_seconds; /* time duration between two sampling */ > + int64_t calc_time_ms; /* desired calculation time (in milliseconds) */ > DirtyRateMeasureMode mode; /* mode of dirtyrate measurement */ > }; > > @@ -73,7 +75,7 @@ typedef struct SampleVMStat { > struct DirtyRateStat { > int64_t dirty_rate; /* dirty rate in MB/s */ > int64_t start_time; /* calculation start time in units of second */ > - int64_t calc_time; /* time duration of two sampling in units of second */ > + int64_t calc_time_ms; /* actual calculation time (in milliseconds) */ > uint64_t sample_pages; /* sample pages per GB */ > union { > SampleVMStat page_sampling; > diff --git a/migration/dirtyrate.c b/migration/dirtyrate.c > index 84f1b0fb20..90fb336329 100644 > --- a/migration/dirtyrate.c > +++ b/migration/dirtyrate.c > @@ -57,6 +57,8 @@ static int64_t dirty_stat_wait(int64_t msec, int64_t initial_time) > msec = current_time - initial_time; > } else { > g_usleep((msec + initial_time - current_time) * 1000); > + /* g_usleep may overshoot */ > + msec = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - initial_time; This removes the g_usleep(), is it intended? I thought it was the code to do the delay. What is the solution to g_usleep() then? 
> } > > return msec; > @@ -77,9 +79,12 @@ static int64_t do_calculate_dirtyrate(DirtyPageRecord dirty_pages, > { > uint64_t increased_dirty_pages = > dirty_pages.end_pages - dirty_pages.start_pages; > - uint64_t memory_size_MiB = qemu_target_pages_to_MiB(increased_dirty_pages); > - > - return memory_size_MiB * 1000 / calc_time_ms; > + /* > + * multiply by 1000ms/s _before_ converting down to megabytes > + * to avoid losing precision > + */ > + return qemu_target_pages_to_MiB(increased_dirty_pages * 1000) / > + calc_time_ms; > } > > void global_dirty_log_change(unsigned int flag, bool start) > @@ -183,10 +188,9 @@ retry: > return duration; > } > > -static bool is_sample_period_valid(int64_t sec) > +static bool is_calc_time_valid(int64_t msec) > { > - if (sec < MIN_FETCH_DIRTYRATE_TIME_SEC || > - sec > MAX_FETCH_DIRTYRATE_TIME_SEC) { > + if ((msec < MIN_CALC_TIME_MS) || (msec > MAX_CALC_TIME_MS)) { > return false; > } > > @@ -219,7 +223,8 @@ static struct DirtyRateInfo *query_dirty_rate_info(void) > > info->status = CalculatingState; > info->start_time = DirtyStat.start_time; > - info->calc_time = DirtyStat.calc_time; > + info->calc_time_ms = DirtyStat.calc_time_ms; > + info->calc_time = DirtyStat.calc_time_ms / 1000; > info->sample_pages = DirtyStat.sample_pages; > info->mode = dirtyrate_mode; > > @@ -258,7 +263,7 @@ static void init_dirtyrate_stat(int64_t start_time, > { > DirtyStat.dirty_rate = -1; > DirtyStat.start_time = start_time; > - DirtyStat.calc_time = config.sample_period_seconds; > + DirtyStat.calc_time_ms = config.calc_time_ms; > DirtyStat.sample_pages = config.sample_pages_per_gigabytes; > > switch (config.mode) { > @@ -568,7 +573,6 @@ static inline void dirtyrate_manual_reset_protect(void) > > static void calculate_dirtyrate_dirty_bitmap(struct DirtyRateConfig config) > { > - int64_t msec = 0; > int64_t start_time; > DirtyPageRecord dirty_pages; > > @@ -596,9 +600,7 @@ static void calculate_dirtyrate_dirty_bitmap(struct DirtyRateConfig config) > 
start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); > DirtyStat.start_time = start_time / 1000; > > - msec = config.sample_period_seconds * 1000; > - msec = dirty_stat_wait(msec, start_time); > - DirtyStat.calc_time = msec / 1000; > + DirtyStat.calc_time_ms = dirty_stat_wait(config.calc_time_ms, start_time); > > /* > * do two things. > @@ -609,12 +611,12 @@ static void calculate_dirtyrate_dirty_bitmap(struct DirtyRateConfig config) > > record_dirtypages_bitmap(&dirty_pages, false); > > - DirtyStat.dirty_rate = do_calculate_dirtyrate(dirty_pages, msec); > + DirtyStat.dirty_rate = do_calculate_dirtyrate(dirty_pages, > + DirtyStat.calc_time_ms); > } > > static void calculate_dirtyrate_dirty_ring(struct DirtyRateConfig config) > { > - int64_t duration; > uint64_t dirtyrate = 0; > uint64_t dirtyrate_sum = 0; > int i = 0; > @@ -625,12 +627,10 @@ static void calculate_dirtyrate_dirty_ring(struct DirtyRateConfig config) > DirtyStat.start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) / 1000; > > /* calculate vcpu dirtyrate */ > - duration = vcpu_calculate_dirtyrate(config.sample_period_seconds * 1000, > - &DirtyStat.dirty_ring, > - GLOBAL_DIRTY_DIRTY_RATE, > - true); > - > - DirtyStat.calc_time = duration / 1000; > + DirtyStat.calc_time_ms = vcpu_calculate_dirtyrate(config.calc_time_ms, > + &DirtyStat.dirty_ring, > + GLOBAL_DIRTY_DIRTY_RATE, > + true); > > /* calculate vm dirtyrate */ > for (i = 0; i < DirtyStat.dirty_ring.nvcpu; i++) { > @@ -646,7 +646,6 @@ static void calculate_dirtyrate_sample_vm(struct DirtyRateConfig config) > { > struct RamblockDirtyInfo *block_dinfo = NULL; > int block_count = 0; > - int64_t msec = 0; > int64_t initial_time; > > rcu_read_lock(); > @@ -656,17 +655,16 @@ static void calculate_dirtyrate_sample_vm(struct DirtyRateConfig config) > } > rcu_read_unlock(); > > - msec = config.sample_period_seconds * 1000; > - msec = dirty_stat_wait(msec, initial_time); > + DirtyStat.calc_time_ms = dirty_stat_wait(config.calc_time_ms, > + initial_time); > 
DirtyStat.start_time = initial_time / 1000; > - DirtyStat.calc_time = msec / 1000; > > rcu_read_lock(); > if (!compare_page_hash_info(block_dinfo, block_count)) { > goto out; > } > > - update_dirtyrate(msec); > + update_dirtyrate(DirtyStat.calc_time_ms); > > out: > rcu_read_unlock(); > @@ -711,7 +709,10 @@ void *get_dirtyrate_thread(void *arg) > return NULL; > } > > -void qmp_calc_dirty_rate(int64_t calc_time, > +void qmp_calc_dirty_rate(bool has_calc_time, > + int64_t calc_time, > + bool has_calc_time_ms, > + int64_t calc_time_ms, > bool has_sample_pages, > int64_t sample_pages, > bool has_mode, > @@ -731,10 +732,21 @@ void qmp_calc_dirty_rate(int64_t calc_time, > return; > } > > - if (!is_sample_period_valid(calc_time)) { > - error_setg(errp, "calc-time is out of range[%d, %d].", > - MIN_FETCH_DIRTYRATE_TIME_SEC, > - MAX_FETCH_DIRTYRATE_TIME_SEC); > + if ((int)has_calc_time + (int)has_calc_time_ms != 1) { > + error_setg(errp, "Exactly one of calc-time and calc-time-ms must" > + " be specified"); > + return; > + } > + if (has_calc_time) { > + /* > + * The worst thing that can happen due to overflow is that > + * invalid value will become valid. 
> + */ > + calc_time_ms = calc_time * 1000; > + } > + if (!is_calc_time_valid(calc_time_ms)) { > + error_setg(errp, "Calculation time is out of range[%dms, %dms].", > + MIN_CALC_TIME_MS, MAX_CALC_TIME_MS); > return; > } > > @@ -781,7 +793,7 @@ void qmp_calc_dirty_rate(int64_t calc_time, > return; > } > > - config.sample_period_seconds = calc_time; > + config.calc_time_ms = calc_time_ms; > config.sample_pages_per_gigabytes = sample_pages; > config.mode = mode; > > @@ -867,8 +879,11 @@ void hmp_calc_dirty_rate(Monitor *mon, const QDict *qdict) > mode = DIRTY_RATE_MEASURE_MODE_DIRTY_RING; > } > > - qmp_calc_dirty_rate(sec, has_sample_pages, sample_pages, true, > - mode, &err); > + qmp_calc_dirty_rate(true, sec, /* calc_time */ > + false, 0, /* calc_time_ms */ > + has_sample_pages, sample_pages, > + true, mode, > + &err); > if (err) { > hmp_handle_error(mon, err); > return; > -- > 2.30.2 > -- Peter Xu
On Thu, Jul 06, 2023 at 03:23:43PM -0400, Peter Xu wrote: > On Thu, Jun 29, 2023 at 11:59:03AM +0300, Andrei Gudkov wrote: > > Introduces alternative argument calc-time-ms, which is the > > the same as calc-time but accepts millisecond value. > > Millisecond precision allows to make predictions whether > > migration will succeed or not. To do this, calculate dirty > > rate with calc-time-ms set to max allowed downtime, convert > > measured rate into volume of dirtied memory, and divide by > > network throughput. If the value is lower than max allowed > > downtime, then migration will converge. > > > > Measurement results for single thread randomly writing to > > a 24GiB region: > > +--------------+--------------------+ > > | calc-time-ms | dirty-rate (MiB/s) | > > +--------------+--------------------+ > > | 100 | 1880 | > > | 200 | 1340 | > > | 300 | 1120 | > > | 400 | 1030 | > > | 500 | 868 | > > | 750 | 720 | > > | 1000 | 636 | > > | 1500 | 498 | > > | 2000 | 423 | > > +--------------+--------------------+ > > Do you mean the dirty workload is constant? Why it differs so much with > different calc-time-ms? Workload is as constant as it could be. But the naming is misleading. What is named "dirty-rate" in fact is not "rate" at all. calc-dirty-rate measures number of *uniquely* dirtied pages, i.e. each page can contribute to the counter only once during measurement period. That's why the values are decreasing. Consider also ad infinitum argument: since VM has fixed number of pages and each page can be dirtied only once, dirty-rate=number-of-dirtied-pages/calc-time -> 0 as calc-time -> inf. It would make more sense to report number as "dirty-volume" -- without dividing it by calc-time. Note that number of *uniquely* dirtied pages in given amount of time is exactly what we need for doing migration-related predictions. There is no error here. 
> > I also doubt usefulness on huge vms the ms accuracy won't matter much > because enable/disable dirty logging overhead can already be ms level for > those. > The goal is to measure volume of dirtied memory corresponding to desired downtime, which is typically order of 100-300ms. I think that error of +/-10ms won't make any big difference. Maybe I should've called this "millisecond granularity" and not "millisecond precision". > > > > Signed-off-by: Andrei Gudkov <gudkov.andrei@huawei.com> > > --- > > qapi/migration.json | 15 ++++++-- > > migration/dirtyrate.h | 12 ++++--- > > migration/dirtyrate.c | 81 +++++++++++++++++++++++++------------------ > > 3 files changed, 68 insertions(+), 40 deletions(-) > > > > diff --git a/qapi/migration.json b/qapi/migration.json > > index 5bb5ab82a0..dd1afe1982 100644 > > --- a/qapi/migration.json > > +++ b/qapi/migration.json > > @@ -1778,7 +1778,12 @@ > > # > > # @start-time: start time in units of second for calculation > > # > > -# @calc-time: time in units of second for sample dirty pages > > +# @calc-time: time period for which dirty page rate was measured > > +# (rounded down to seconds). > > +# > > +# @calc-time-ms: actual time period for which dirty page rate was > > +# measured (in milliseconds). Value may be larger than requested > > +# time period due to measurement overhead. > > # > > # @sample-pages: page count per GB for sample dirty pages the default > > # value is 512 (since 6.1) > > @@ -1796,6 +1801,7 @@ > > 'status': 'DirtyRateStatus', > > 'start-time': 'int64', > > 'calc-time': 'int64', > > + 'calc-time-ms': 'int64', > > 'sample-pages': 'uint64', > > 'mode': 'DirtyRateMeasureMode', > > '*vcpu-dirty-rate': [ 'DirtyRateVcpu' ] } } > > @@ -1807,6 +1813,10 @@ > > # > > # @calc-time: time in units of second for sample dirty pages > > # > > +# @calc-time-ms: the same as @calc-time but in milliseconds. These > > +# two arguments are mutually exclusive. Exactly one of them must > > +# be specified. 
(Since 8.1) > > +# > > # @sample-pages: page count per GB for sample dirty pages the default > > # value is 512 (since 6.1) > > # > > @@ -1821,7 +1831,8 @@ > > # 'sample-pages': 512} } > > # <- { "return": {} } > > ## > > -{ 'command': 'calc-dirty-rate', 'data': {'calc-time': 'int64', > > +{ 'command': 'calc-dirty-rate', 'data': {'*calc-time': 'int64', > > + '*calc-time-ms': 'int64', > > '*sample-pages': 'int', > > '*mode': 'DirtyRateMeasureMode'} } > > > > diff --git a/migration/dirtyrate.h b/migration/dirtyrate.h > > index 594a5c0bb6..869c060941 100644 > > --- a/migration/dirtyrate.h > > +++ b/migration/dirtyrate.h > > @@ -31,10 +31,12 @@ > > #define MIN_RAMBLOCK_SIZE 128 > > > > /* > > - * Take 1s as minimum time for calculation duration > > + * Allowed range for dirty page rate calculation (in milliseconds). > > + * Lower limit relates to the smallest realistic downtime it > > + * makes sense to impose on migration. > > */ > > -#define MIN_FETCH_DIRTYRATE_TIME_SEC 1 > > -#define MAX_FETCH_DIRTYRATE_TIME_SEC 60 > > +#define MIN_CALC_TIME_MS 50 > > +#define MAX_CALC_TIME_MS 60000 > > > > /* > > * Take 1/16 pages in 1G as the maxmum sample page count > > @@ -44,7 +46,7 @@ > > > > struct DirtyRateConfig { > > uint64_t sample_pages_per_gigabytes; /* sample pages per GB */ > > - int64_t sample_period_seconds; /* time duration between two sampling */ > > + int64_t calc_time_ms; /* desired calculation time (in milliseconds) */ > > DirtyRateMeasureMode mode; /* mode of dirtyrate measurement */ > > }; > > > > @@ -73,7 +75,7 @@ typedef struct SampleVMStat { > > struct DirtyRateStat { > > int64_t dirty_rate; /* dirty rate in MB/s */ > > int64_t start_time; /* calculation start time in units of second */ > > - int64_t calc_time; /* time duration of two sampling in units of second */ > > + int64_t calc_time_ms; /* actual calculation time (in milliseconds) */ > > uint64_t sample_pages; /* sample pages per GB */ > > union { > > SampleVMStat page_sampling; > > diff --git 
a/migration/dirtyrate.c b/migration/dirtyrate.c > > index 84f1b0fb20..90fb336329 100644 > > --- a/migration/dirtyrate.c > > +++ b/migration/dirtyrate.c > > @@ -57,6 +57,8 @@ static int64_t dirty_stat_wait(int64_t msec, int64_t initial_time) > > msec = current_time - initial_time; > > } else { > > g_usleep((msec + initial_time - current_time) * 1000); > > + /* g_usleep may overshoot */ > > + msec = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - initial_time; > > This removes the g_usleep(), is it intended? I thought it was the code to > do the delay. What is the solution to g_usleep() then? > It is still there -- just above the comment. What I added is qemu_clock_get_ms() call after the g_usleep. It sets msec variable to the true amount of time elapsed between measurements. Previously I detected that actually slept time may be slightly longer than requested value. > > } > > > > return msec; > > @@ -77,9 +79,12 @@ static int64_t do_calculate_dirtyrate(DirtyPageRecord dirty_pages, > > { > > uint64_t increased_dirty_pages = > > dirty_pages.end_pages - dirty_pages.start_pages; > > - uint64_t memory_size_MiB = qemu_target_pages_to_MiB(increased_dirty_pages); > > - > > - return memory_size_MiB * 1000 / calc_time_ms; > > + /* > > + * multiply by 1000ms/s _before_ converting down to megabytes > > + * to avoid losing precision > > + */ > > + return qemu_target_pages_to_MiB(increased_dirty_pages * 1000) / > > + calc_time_ms; > > } > > > > void global_dirty_log_change(unsigned int flag, bool start) > > @@ -183,10 +188,9 @@ retry: > > return duration; > > } > > > > -static bool is_sample_period_valid(int64_t sec) > > +static bool is_calc_time_valid(int64_t msec) > > { > > - if (sec < MIN_FETCH_DIRTYRATE_TIME_SEC || > > - sec > MAX_FETCH_DIRTYRATE_TIME_SEC) { > > + if ((msec < MIN_CALC_TIME_MS) || (msec > MAX_CALC_TIME_MS)) { > > return false; > > } > > > > @@ -219,7 +223,8 @@ static struct DirtyRateInfo *query_dirty_rate_info(void) > > > > info->status = CalculatingState; > > 
info->start_time = DirtyStat.start_time; > > - info->calc_time = DirtyStat.calc_time; > > + info->calc_time_ms = DirtyStat.calc_time_ms; > > + info->calc_time = DirtyStat.calc_time_ms / 1000; > > info->sample_pages = DirtyStat.sample_pages; > > info->mode = dirtyrate_mode; > > > > @@ -258,7 +263,7 @@ static void init_dirtyrate_stat(int64_t start_time, > > { > > DirtyStat.dirty_rate = -1; > > DirtyStat.start_time = start_time; > > - DirtyStat.calc_time = config.sample_period_seconds; > > + DirtyStat.calc_time_ms = config.calc_time_ms; > > DirtyStat.sample_pages = config.sample_pages_per_gigabytes; > > > > switch (config.mode) { > > @@ -568,7 +573,6 @@ static inline void dirtyrate_manual_reset_protect(void) > > > > static void calculate_dirtyrate_dirty_bitmap(struct DirtyRateConfig config) > > { > > - int64_t msec = 0; > > int64_t start_time; > > DirtyPageRecord dirty_pages; > > > > @@ -596,9 +600,7 @@ static void calculate_dirtyrate_dirty_bitmap(struct DirtyRateConfig config) > > start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); > > DirtyStat.start_time = start_time / 1000; > > > > - msec = config.sample_period_seconds * 1000; > > - msec = dirty_stat_wait(msec, start_time); > > - DirtyStat.calc_time = msec / 1000; > > + DirtyStat.calc_time_ms = dirty_stat_wait(config.calc_time_ms, start_time); > > > > /* > > * do two things. 
> > @@ -609,12 +611,12 @@ static void calculate_dirtyrate_dirty_bitmap(struct DirtyRateConfig config) > > > > record_dirtypages_bitmap(&dirty_pages, false); > > > > - DirtyStat.dirty_rate = do_calculate_dirtyrate(dirty_pages, msec); > > + DirtyStat.dirty_rate = do_calculate_dirtyrate(dirty_pages, > > + DirtyStat.calc_time_ms); > > } > > > > static void calculate_dirtyrate_dirty_ring(struct DirtyRateConfig config) > > { > > - int64_t duration; > > uint64_t dirtyrate = 0; > > uint64_t dirtyrate_sum = 0; > > int i = 0; > > @@ -625,12 +627,10 @@ static void calculate_dirtyrate_dirty_ring(struct DirtyRateConfig config) > > DirtyStat.start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) / 1000; > > > > /* calculate vcpu dirtyrate */ > > - duration = vcpu_calculate_dirtyrate(config.sample_period_seconds * 1000, > > - &DirtyStat.dirty_ring, > > - GLOBAL_DIRTY_DIRTY_RATE, > > - true); > > - > > - DirtyStat.calc_time = duration / 1000; > > + DirtyStat.calc_time_ms = vcpu_calculate_dirtyrate(config.calc_time_ms, > > + &DirtyStat.dirty_ring, > > + GLOBAL_DIRTY_DIRTY_RATE, > > + true); > > > > /* calculate vm dirtyrate */ > > for (i = 0; i < DirtyStat.dirty_ring.nvcpu; i++) { > > @@ -646,7 +646,6 @@ static void calculate_dirtyrate_sample_vm(struct DirtyRateConfig config) > > { > > struct RamblockDirtyInfo *block_dinfo = NULL; > > int block_count = 0; > > - int64_t msec = 0; > > int64_t initial_time; > > > > rcu_read_lock(); > > @@ -656,17 +655,16 @@ static void calculate_dirtyrate_sample_vm(struct DirtyRateConfig config) > > } > > rcu_read_unlock(); > > > > - msec = config.sample_period_seconds * 1000; > > - msec = dirty_stat_wait(msec, initial_time); > > + DirtyStat.calc_time_ms = dirty_stat_wait(config.calc_time_ms, > > + initial_time); > > DirtyStat.start_time = initial_time / 1000; > > - DirtyStat.calc_time = msec / 1000; > > > > rcu_read_lock(); > > if (!compare_page_hash_info(block_dinfo, block_count)) { > > goto out; > > } > > > > - update_dirtyrate(msec); > > + 
update_dirtyrate(DirtyStat.calc_time_ms); > > > > out: > > rcu_read_unlock(); > > @@ -711,7 +709,10 @@ void *get_dirtyrate_thread(void *arg) > > return NULL; > > } > > > > -void qmp_calc_dirty_rate(int64_t calc_time, > > +void qmp_calc_dirty_rate(bool has_calc_time, > > + int64_t calc_time, > > + bool has_calc_time_ms, > > + int64_t calc_time_ms, > > bool has_sample_pages, > > int64_t sample_pages, > > bool has_mode, > > @@ -731,10 +732,21 @@ void qmp_calc_dirty_rate(int64_t calc_time, > > return; > > } > > > > - if (!is_sample_period_valid(calc_time)) { > > - error_setg(errp, "calc-time is out of range[%d, %d].", > > - MIN_FETCH_DIRTYRATE_TIME_SEC, > > - MAX_FETCH_DIRTYRATE_TIME_SEC); > > + if ((int)has_calc_time + (int)has_calc_time_ms != 1) { > > + error_setg(errp, "Exactly one of calc-time and calc-time-ms must" > > + " be specified"); > > + return; > > + } > > + if (has_calc_time) { > > + /* > > + * The worst thing that can happen due to overflow is that > > + * invalid value will become valid. 
> > + */ > > + calc_time_ms = calc_time * 1000; > > + } > > + if (!is_calc_time_valid(calc_time_ms)) { > > + error_setg(errp, "Calculation time is out of range[%dms, %dms].", > > + MIN_CALC_TIME_MS, MAX_CALC_TIME_MS); > > return; > > } > > > > @@ -781,7 +793,7 @@ void qmp_calc_dirty_rate(int64_t calc_time, > > return; > > } > > > > - config.sample_period_seconds = calc_time; > > + config.calc_time_ms = calc_time_ms; > > config.sample_pages_per_gigabytes = sample_pages; > > config.mode = mode; > > > > @@ -867,8 +879,11 @@ void hmp_calc_dirty_rate(Monitor *mon, const QDict *qdict) > > mode = DIRTY_RATE_MEASURE_MODE_DIRTY_RING; > > } > > > > - qmp_calc_dirty_rate(sec, has_sample_pages, sample_pages, true, > > - mode, &err); > > + qmp_calc_dirty_rate(true, sec, /* calc_time */ > > + false, 0, /* calc_time_ms */ > > + has_sample_pages, sample_pages, > > + true, mode, > > + &err); > > if (err) { > > hmp_handle_error(mon, err); > > return; > > -- > > 2.30.2 > > > > -- > Peter Xu
On Tue, Jul 11, 2023 at 03:38:18PM +0300, gudkov.andrei@huawei.com wrote: > On Thu, Jul 06, 2023 at 03:23:43PM -0400, Peter Xu wrote: > > On Thu, Jun 29, 2023 at 11:59:03AM +0300, Andrei Gudkov wrote: > > > Introduces alternative argument calc-time-ms, which is the > > > the same as calc-time but accepts millisecond value. > > > Millisecond precision allows to make predictions whether > > > migration will succeed or not. To do this, calculate dirty > > > rate with calc-time-ms set to max allowed downtime, convert > > > measured rate into volume of dirtied memory, and divide by > > > network throughput. If the value is lower than max allowed > > > downtime, then migration will converge. > > > > > > Measurement results for single thread randomly writing to > > > a 24GiB region: > > > +--------------+--------------------+ > > > | calc-time-ms | dirty-rate (MiB/s) | > > > +--------------+--------------------+ > > > | 100 | 1880 | > > > | 200 | 1340 | > > > | 300 | 1120 | > > > | 400 | 1030 | > > > | 500 | 868 | > > > | 750 | 720 | > > > | 1000 | 636 | > > > | 1500 | 498 | > > > | 2000 | 423 | > > > +--------------+--------------------+ > > > > Do you mean the dirty workload is constant? Why it differs so much with > > different calc-time-ms? > > Workload is as constant as it could be. But the naming is misleading. > What is named "dirty-rate" in fact is not "rate" at all. > calc-dirty-rate measures number of *uniquely* dirtied pages, i.e. each > page can contribute to the counter only once during measurement period. > That's why the values are decreasing. Consider also ad infinitum argument: > since VM has fixed number of pages and each page can be dirtied only once, > dirty-rate=number-of-dirtied-pages/calc-time -> 0 as calc-time -> inf. > It would make more sense to report number as "dirty-volume" -- > without dividing it by calc-time. 
> > Note that number of *uniquely* dirtied pages in given amount of time is > exactly what we need for doing migration-related predictions. There is > no error here. Is calc-time-ms the duration of the measurement? Taking the 1st line as example, 1880MB/s * 0.1s = 188MB. For the 2nd line, 1340MB/s * 0.2s = 268MB. Even for the longest duration of 2s, that's 846MB in total. The range is 24GB. In this case, most of the pages should only be written once even if random for all these test durations, right? > > > > > I also doubt usefulness on huge vms the ms accuracy won't matter much > > because enable/disable dirty logging overhead can already be ms level for > > those. > > > The goal is to measure volume of dirtied memory corresponding to desired > downtime, which is typically order of 100-300ms. I think that error of > +/-10ms won't make any big difference. Maybe I should've called this > "millisecond granularity" and not "millisecond precision". > > > > > > > Signed-off-by: Andrei Gudkov <gudkov.andrei@huawei.com> > > > --- > > > qapi/migration.json | 15 ++++++-- > > > migration/dirtyrate.h | 12 ++++--- > > > migration/dirtyrate.c | 81 +++++++++++++++++++++++++------------------ > > > 3 files changed, 68 insertions(+), 40 deletions(-) > > > > > > diff --git a/qapi/migration.json b/qapi/migration.json > > > index 5bb5ab82a0..dd1afe1982 100644 > > > --- a/qapi/migration.json > > > +++ b/qapi/migration.json > > > @@ -1778,7 +1778,12 @@ > > > # > > > # @start-time: start time in units of second for calculation > > > # > > > -# @calc-time: time in units of second for sample dirty pages > > > +# @calc-time: time period for which dirty page rate was measured > > > +# (rounded down to seconds). > > > +# > > > +# @calc-time-ms: actual time period for which dirty page rate was > > > +# measured (in milliseconds). Value may be larger than requested > > > +# time period due to measurement overhead. 
> > > # > > > # @sample-pages: page count per GB for sample dirty pages the default > > > # value is 512 (since 6.1) > > > @@ -1796,6 +1801,7 @@ > > > 'status': 'DirtyRateStatus', > > > 'start-time': 'int64', > > > 'calc-time': 'int64', > > > + 'calc-time-ms': 'int64', > > > 'sample-pages': 'uint64', > > > 'mode': 'DirtyRateMeasureMode', > > > '*vcpu-dirty-rate': [ 'DirtyRateVcpu' ] } } > > > @@ -1807,6 +1813,10 @@ > > > # > > > # @calc-time: time in units of second for sample dirty pages > > > # > > > +# @calc-time-ms: the same as @calc-time but in milliseconds. These > > > +# two arguments are mutually exclusive. Exactly one of them must > > > +# be specified. (Since 8.1) > > > +# > > > # @sample-pages: page count per GB for sample dirty pages the default > > > # value is 512 (since 6.1) > > > # > > > @@ -1821,7 +1831,8 @@ > > > # 'sample-pages': 512} } > > > # <- { "return": {} } > > > ## > > > -{ 'command': 'calc-dirty-rate', 'data': {'calc-time': 'int64', > > > +{ 'command': 'calc-dirty-rate', 'data': {'*calc-time': 'int64', > > > + '*calc-time-ms': 'int64', > > > '*sample-pages': 'int', > > > '*mode': 'DirtyRateMeasureMode'} } > > > > > > diff --git a/migration/dirtyrate.h b/migration/dirtyrate.h > > > index 594a5c0bb6..869c060941 100644 > > > --- a/migration/dirtyrate.h > > > +++ b/migration/dirtyrate.h > > > @@ -31,10 +31,12 @@ > > > #define MIN_RAMBLOCK_SIZE 128 > > > > > > /* > > > - * Take 1s as minimum time for calculation duration > > > + * Allowed range for dirty page rate calculation (in milliseconds). > > > + * Lower limit relates to the smallest realistic downtime it > > > + * makes sense to impose on migration. 
> > > */ > > > -#define MIN_FETCH_DIRTYRATE_TIME_SEC 1 > > > -#define MAX_FETCH_DIRTYRATE_TIME_SEC 60 > > > +#define MIN_CALC_TIME_MS 50 > > > +#define MAX_CALC_TIME_MS 60000 > > > > > > /* > > > * Take 1/16 pages in 1G as the maxmum sample page count > > > @@ -44,7 +46,7 @@ > > > > > > struct DirtyRateConfig { > > > uint64_t sample_pages_per_gigabytes; /* sample pages per GB */ > > > - int64_t sample_period_seconds; /* time duration between two sampling */ > > > + int64_t calc_time_ms; /* desired calculation time (in milliseconds) */ > > > DirtyRateMeasureMode mode; /* mode of dirtyrate measurement */ > > > }; > > > > > > @@ -73,7 +75,7 @@ typedef struct SampleVMStat { > > > struct DirtyRateStat { > > > int64_t dirty_rate; /* dirty rate in MB/s */ > > > int64_t start_time; /* calculation start time in units of second */ > > > - int64_t calc_time; /* time duration of two sampling in units of second */ > > > + int64_t calc_time_ms; /* actual calculation time (in milliseconds) */ > > > uint64_t sample_pages; /* sample pages per GB */ > > > union { > > > SampleVMStat page_sampling; > > > diff --git a/migration/dirtyrate.c b/migration/dirtyrate.c > > > index 84f1b0fb20..90fb336329 100644 > > > --- a/migration/dirtyrate.c > > > +++ b/migration/dirtyrate.c > > > @@ -57,6 +57,8 @@ static int64_t dirty_stat_wait(int64_t msec, int64_t initial_time) > > > msec = current_time - initial_time; > > > } else { > > > g_usleep((msec + initial_time - current_time) * 1000); > > > + /* g_usleep may overshoot */ > > > + msec = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - initial_time; > > > > This removes the g_usleep(), is it intended? I thought it was the code to > > do the delay. What is the solution to g_usleep() then? > > > It is still there -- just above the comment. What I added is > qemu_clock_get_ms() call after the g_usleep. It sets msec variable > to the true amount of time elapsed between measurements. 
Previously > I detected that actually slept time may be slightly longer than > requested value. Ouch.. please ignore that. I don't know how I read that. The implementation is pretty much straightforward and fine by me; I just want to make sure I still understand those numbers you provided in the commit message, and to make sure that measuring over a small duration can at least make sense in the majority of cases. Thanks, -- Peter Xu
On Mon, Jul 17, 2023 at 03:08:37PM -0400, Peter Xu wrote: > On Tue, Jul 11, 2023 at 03:38:18PM +0300, gudkov.andrei@huawei.com wrote: > > On Thu, Jul 06, 2023 at 03:23:43PM -0400, Peter Xu wrote: > > > On Thu, Jun 29, 2023 at 11:59:03AM +0300, Andrei Gudkov wrote: > > > > Introduces alternative argument calc-time-ms, which is the > > > > the same as calc-time but accepts millisecond value. > > > > Millisecond precision allows to make predictions whether > > > > migration will succeed or not. To do this, calculate dirty > > > > rate with calc-time-ms set to max allowed downtime, convert > > > > measured rate into volume of dirtied memory, and divide by > > > > network throughput. If the value is lower than max allowed > > > > downtime, then migration will converge. > > > > > > > > Measurement results for single thread randomly writing to > > > > a 24GiB region: > > > > +--------------+--------------------+ > > > > | calc-time-ms | dirty-rate (MiB/s) | > > > > +--------------+--------------------+ > > > > | 100 | 1880 | > > > > | 200 | 1340 | > > > > | 300 | 1120 | > > > > | 400 | 1030 | > > > > | 500 | 868 | > > > > | 750 | 720 | > > > > | 1000 | 636 | > > > > | 1500 | 498 | > > > > | 2000 | 423 | > > > > +--------------+--------------------+ > > > > > > Do you mean the dirty workload is constant? Why it differs so much with > > > different calc-time-ms? > > > > Workload is as constant as it could be. But the naming is misleading. > > What is named "dirty-rate" in fact is not "rate" at all. > > calc-dirty-rate measures number of *uniquely* dirtied pages, i.e. each > > page can contribute to the counter only once during measurement period. > > That's why the values are decreasing. Consider also ad infinitum argument: > > since VM has fixed number of pages and each page can be dirtied only once, > > dirty-rate=number-of-dirtied-pages/calc-time -> 0 as calc-time -> inf. 
> > It would make more sense to report number as "dirty-volume" -- > > without dividing it by calc-time. > > > > Note that number of *uniquely* dirtied pages in given amount of time is > > exactly what we need for doing migration-related predictions. There is > > no error here. > > Is calc-time-ms the duration of the measurement? > > Taking the 1st line as example, 1880MB/s * 0.1s = 188MB. > For the 2nd line, 1340MB/s * 0.2s = 268MB. > Even for the longest duration of 2s, that's 846MB in total. > > The range is 24GB. In this case, most of the pages should only be written > once even if random for all these test durations, right? > Yes, I messed with load generator. The effective memory region was much smaller than 24GiB. I performed more testing (after fixing load generator), now with different memory sizes and different modes. +--------------+-----------------------------------------------+ | calc-time-ms | dirty rate MiB/s | | +----------------+---------------+--------------+ | | theoretical | page-sampling | dirty-bitmap | | | (at 3M wr/sec) | | | +--------------+----------------+---------------+--------------+ | 1GiB | +--------------+----------------+---------------+--------------+ | 100 | 6996 | 7100 | 3192 | | 200 | 4606 | 4660 | 2655 | | 300 | 3305 | 3280 | 2371 | | 400 | 2534 | 2525 | 2154 | | 500 | 2041 | 2044 | 1871 | | 750 | 1365 | 1341 | 1358 | | 1000 | 1024 | 1052 | 1025 | | 1500 | 683 | 678 | 684 | | 2000 | 512 | 507 | 513 | +--------------+----------------+---------------+--------------+ | 4GiB | +--------------+----------------+---------------+--------------+ | 100 | 10232 | 8880 | 4070 | | 200 | 8954 | 8049 | 3195 | | 300 | 7889 | 7193 | 2881 | | 400 | 6996 | 6530 | 2700 | | 500 | 6245 | 5772 | 2312 | | 750 | 4829 | 4586 | 2465 | | 1000 | 3865 | 3780 | 2178 | | 1500 | 2694 | 2633 | 2004 | | 2000 | 2041 | 2031 | 1789 | +--------------+----------------+---------------+--------------+ | 24GiB | 
+--------------+----------------+---------------+--------------+ | 100 | 11495 | 8640 | 5597 | | 200 | 11226 | 8616 | 3527 | | 300 | 10965 | 8386 | 2355 | | 400 | 10713 | 8370 | 2179 | | 500 | 10469 | 8196 | 2098 | | 750 | 9890 | 7885 | 2556 | | 1000 | 9354 | 7506 | 2084 | | 1500 | 8397 | 6944 | 2075 | | 2000 | 7574 | 6402 | 2062 | +--------------+----------------+---------------+--------------+ Theoretical values are computed according to the following formula: size * (1 - (1-(4096/size))^(time*wps)) / (time * 2^20), where size is in bytes, time is in seconds, and wps is number of writes per second (I measured approximately 3000000 on my system). Theoretical values and values obtained with page-sampling are approximately close (<=25%). Dirty-bitmap values are much lower, likely because the majority of writes cause page faults. Even though dirty-bitmap logic is closer to what is happening during live migration, I still favor page sampling because the latter doesn't impact the performance of VM too much. Whether calc-time < 1sec is meaningful or not depends on the size of memory region with active writes. 1. If we have big VM and writes are evenly spread over the whole address space, then almost all writes will go into unique pages. In this case number of dirty pages will grow approximately linearly with time for small calc-time values. 2. But if memory region with active writes is small enough, then many writes will go to the same page, and the number of dirty pages will grow sublinearly even for small calc-time values. Note that the second scenario can happen even VM RAM is big. For example, imagine 128GiB VM with in-memory database that is used for reading. Although VM size is big, the memory region with active writes is just the application stack.
Hi, Andrei, On Mon, Jul 31, 2023 at 05:51:49PM +0300, gudkov.andrei@huawei.com wrote: > On Mon, Jul 17, 2023 at 03:08:37PM -0400, Peter Xu wrote: > > On Tue, Jul 11, 2023 at 03:38:18PM +0300, gudkov.andrei@huawei.com wrote: > > > On Thu, Jul 06, 2023 at 03:23:43PM -0400, Peter Xu wrote: > > > > On Thu, Jun 29, 2023 at 11:59:03AM +0300, Andrei Gudkov wrote: > > > > > Introduces alternative argument calc-time-ms, which is the > > > > > the same as calc-time but accepts millisecond value. > > > > > Millisecond precision allows to make predictions whether > > > > > migration will succeed or not. To do this, calculate dirty > > > > > rate with calc-time-ms set to max allowed downtime, convert > > > > > measured rate into volume of dirtied memory, and divide by > > > > > network throughput. If the value is lower than max allowed > > > > > downtime, then migration will converge. > > > > > > > > > > Measurement results for single thread randomly writing to > > > > > a 24GiB region: > > > > > +--------------+--------------------+ > > > > > | calc-time-ms | dirty-rate (MiB/s) | > > > > > +--------------+--------------------+ > > > > > | 100 | 1880 | > > > > > | 200 | 1340 | > > > > > | 300 | 1120 | > > > > > | 400 | 1030 | > > > > > | 500 | 868 | > > > > > | 750 | 720 | > > > > > | 1000 | 636 | > > > > > | 1500 | 498 | > > > > > | 2000 | 423 | > > > > > +--------------+--------------------+ > > > > > > > > Do you mean the dirty workload is constant? Why it differs so much with > > > > different calc-time-ms? > > > > > > Workload is as constant as it could be. But the naming is misleading. > > > What is named "dirty-rate" in fact is not "rate" at all. > > > calc-dirty-rate measures number of *uniquely* dirtied pages, i.e. each > > > page can contribute to the counter only once during measurement period. > > > That's why the values are decreasing. 
Consider also ad infinitum argument: > > > since VM has fixed number of pages and each page can be dirtied only once, > > > dirty-rate=number-of-dirtied-pages/calc-time -> 0 as calc-time -> inf. > > > It would make more sense to report number as "dirty-volume" -- > > > without dividing it by calc-time. > > > > > > Note that number of *uniquely* dirtied pages in given amount of time is > > > exactly what we need for doing migration-related predictions. There is > > > no error here. > > > > Is calc-time-ms the duration of the measurement? > > > > Taking the 1st line as example, 1880MB/s * 0.1s = 188MB. > > For the 2nd line, 1340MB/s * 0.2s = 268MB. > > Even for the longest duration of 2s, that's 846MB in total. > > > > The range is 24GB. In this case, most of the pages should only be written > > once even if random for all these test durations, right? > > > > Yes, I messed with load generator. > The effective memory region was much smaller than 24GiB. > I performed more testing (after fixing load generator), > now with different memory sizes and different modes. 
> > +--------------+-----------------------------------------------+ > | calc-time-ms | dirty rate MiB/s | > | +----------------+---------------+--------------+ > | | theoretical | page-sampling | dirty-bitmap | > | | (at 3M wr/sec) | | | > +--------------+----------------+---------------+--------------+ > | 1GiB | > +--------------+----------------+---------------+--------------+ > | 100 | 6996 | 7100 | 3192 | > | 200 | 4606 | 4660 | 2655 | > | 300 | 3305 | 3280 | 2371 | > | 400 | 2534 | 2525 | 2154 | > | 500 | 2041 | 2044 | 1871 | > | 750 | 1365 | 1341 | 1358 | > | 1000 | 1024 | 1052 | 1025 | > | 1500 | 683 | 678 | 684 | > | 2000 | 512 | 507 | 513 | > +--------------+----------------+---------------+--------------+ > | 4GiB | > +--------------+----------------+---------------+--------------+ > | 100 | 10232 | 8880 | 4070 | > | 200 | 8954 | 8049 | 3195 | > | 300 | 7889 | 7193 | 2881 | > | 400 | 6996 | 6530 | 2700 | > | 500 | 6245 | 5772 | 2312 | > | 750 | 4829 | 4586 | 2465 | > | 1000 | 3865 | 3780 | 2178 | > | 1500 | 2694 | 2633 | 2004 | > | 2000 | 2041 | 2031 | 1789 | > +--------------+----------------+---------------+--------------+ > | 24GiB | > +--------------+----------------+---------------+--------------+ > | 100 | 11495 | 8640 | 5597 | > | 200 | 11226 | 8616 | 3527 | > | 300 | 10965 | 8386 | 2355 | > | 400 | 10713 | 8370 | 2179 | > | 500 | 10469 | 8196 | 2098 | > | 750 | 9890 | 7885 | 2556 | > | 1000 | 9354 | 7506 | 2084 | > | 1500 | 8397 | 6944 | 2075 | > | 2000 | 7574 | 6402 | 2062 | > +--------------+----------------+---------------+--------------+ > > Theoretical values are computed according to the following formula: > size * (1 - (1-(4096/size))^(time*wps)) / (time * 2^20), Thanks for more testings and the statistics. I had a feeling that this formula may or may not be accurate, but that's less of an issue here. > where size is in bytes, time is in seconds, and wps is number of > writes per second (I measured approximately 3000000 on my system). 
> > Theoretical values and values obtained with page-sampling are > approximately close (<=25%). Dirty-bitmap values are much lower, > likely because the majority of writes cause page faults. Even though > dirty-bitmap logic is closer to what is happening during live > migration, I still favor page sampling because the latter doesn't > impact the performance of VM too much. Do you really use page samplings in production? I don't remember I mentioned it anywhere before, but it will provide very wrong number when the memory updates has a locality, afaik. For example, when 4G VM only has 1G actively updated, the result can be 25% of reality iiuc, seeing that the rest 3G didn't even change. It works only well with very distributed memory updates. > > Whether calc-time < 1sec is meaningful or not depends on the size > of memory region with active writes. > 1. If we have big VM and writes are evenly spread over the whole > address space, then almost all writes will go into unique pages. > In this case number of dirty pages will grow approximately > linearly with time for small calc-time values. > 2. But if memory region with active writes is small enough, then many > writes will go to the same page, and the number of dirty pages > will grow sublinearly even for small calc-time values. Note that > the second scenario can happen even VM RAM is big. For example, > imagine 128GiB VM with in-memory database that is used for reading. > Although VM size is big, the memory region with active writes is > just the application stack. No issue here to support small calc-time. I think as long as it'll be worthwhile in any use case I'd be fine with it (rather than working for all use cases). Not a super high bar to maintain the change. I copied Yong too, he just volunteered to look after the dirtyrate stuff. Thanks, -- Peter Xu
On Mon, Jul 31, 2023 at 04:06:24PM -0400, Peter Xu wrote: > Hi, Andrei, > > On Mon, Jul 31, 2023 at 05:51:49PM +0300, gudkov.andrei@huawei.com wrote: > > On Mon, Jul 17, 2023 at 03:08:37PM -0400, Peter Xu wrote: > > > On Tue, Jul 11, 2023 at 03:38:18PM +0300, gudkov.andrei@huawei.com wrote: > > > > On Thu, Jul 06, 2023 at 03:23:43PM -0400, Peter Xu wrote: > > > > > On Thu, Jun 29, 2023 at 11:59:03AM +0300, Andrei Gudkov wrote: > > > > > > Introduces alternative argument calc-time-ms, which is the > > > > > > the same as calc-time but accepts millisecond value. > > > > > > Millisecond precision allows to make predictions whether > > > > > > migration will succeed or not. To do this, calculate dirty > > > > > > rate with calc-time-ms set to max allowed downtime, convert > > > > > > measured rate into volume of dirtied memory, and divide by > > > > > > network throughput. If the value is lower than max allowed > > > > > > downtime, then migration will converge. > > > > > > > > > > > > Measurement results for single thread randomly writing to > > > > > > a 24GiB region: > > > > > > +--------------+--------------------+ > > > > > > | calc-time-ms | dirty-rate (MiB/s) | > > > > > > +--------------+--------------------+ > > > > > > | 100 | 1880 | > > > > > > | 200 | 1340 | > > > > > > | 300 | 1120 | > > > > > > | 400 | 1030 | > > > > > > | 500 | 868 | > > > > > > | 750 | 720 | > > > > > > | 1000 | 636 | > > > > > > | 1500 | 498 | > > > > > > | 2000 | 423 | > > > > > > +--------------+--------------------+ > > > > > > > > > > Do you mean the dirty workload is constant? Why it differs so much with > > > > > different calc-time-ms? > > > > > > > > Workload is as constant as it could be. But the naming is misleading. > > > > What is named "dirty-rate" in fact is not "rate" at all. > > > > calc-dirty-rate measures number of *uniquely* dirtied pages, i.e. each > > > > page can contribute to the counter only once during measurement period. 
> > > > That's why the values are decreasing. Consider also ad infinitum argument: > > > > since VM has fixed number of pages and each page can be dirtied only once, > > > > dirty-rate=number-of-dirtied-pages/calc-time -> 0 as calc-time -> inf. > > > > It would make more sense to report number as "dirty-volume" -- > > > > without dividing it by calc-time. > > > > > > > > Note that number of *uniquely* dirtied pages in given amount of time is > > > > exactly what we need for doing migration-related predictions. There is > > > > no error here. > > > > > > Is calc-time-ms the duration of the measurement? > > > > > > Taking the 1st line as example, 1880MB/s * 0.1s = 188MB. > > > For the 2nd line, 1340MB/s * 0.2s = 268MB. > > > Even for the longest duration of 2s, that's 846MB in total. > > > > > > The range is 24GB. In this case, most of the pages should only be written > > > once even if random for all these test durations, right? > > > > > > > Yes, I messed with load generator. > > The effective memory region was much smaller than 24GiB. > > I performed more testing (after fixing load generator), > > now with different memory sizes and different modes. 
> > > > +--------------+-----------------------------------------------+ > > | calc-time-ms | dirty rate MiB/s | > > | +----------------+---------------+--------------+ > > | | theoretical | page-sampling | dirty-bitmap | > > | | (at 3M wr/sec) | | | > > +--------------+----------------+---------------+--------------+ > > | 1GiB | > > +--------------+----------------+---------------+--------------+ > > | 100 | 6996 | 7100 | 3192 | > > | 200 | 4606 | 4660 | 2655 | > > | 300 | 3305 | 3280 | 2371 | > > | 400 | 2534 | 2525 | 2154 | > > | 500 | 2041 | 2044 | 1871 | > > | 750 | 1365 | 1341 | 1358 | > > | 1000 | 1024 | 1052 | 1025 | > > | 1500 | 683 | 678 | 684 | > > | 2000 | 512 | 507 | 513 | > > +--------------+----------------+---------------+--------------+ > > | 4GiB | > > +--------------+----------------+---------------+--------------+ > > | 100 | 10232 | 8880 | 4070 | > > | 200 | 8954 | 8049 | 3195 | > > | 300 | 7889 | 7193 | 2881 | > > | 400 | 6996 | 6530 | 2700 | > > | 500 | 6245 | 5772 | 2312 | > > | 750 | 4829 | 4586 | 2465 | > > | 1000 | 3865 | 3780 | 2178 | > > | 1500 | 2694 | 2633 | 2004 | > > | 2000 | 2041 | 2031 | 1789 | > > +--------------+----------------+---------------+--------------+ > > | 24GiB | > > +--------------+----------------+---------------+--------------+ > > | 100 | 11495 | 8640 | 5597 | > > | 200 | 11226 | 8616 | 3527 | > > | 300 | 10965 | 8386 | 2355 | > > | 400 | 10713 | 8370 | 2179 | > > | 500 | 10469 | 8196 | 2098 | > > | 750 | 9890 | 7885 | 2556 | > > | 1000 | 9354 | 7506 | 2084 | > > | 1500 | 8397 | 6944 | 2075 | > > | 2000 | 7574 | 6402 | 2062 | > > +--------------+----------------+---------------+--------------+ > > > > Theoretical values are computed according to the following formula: > > size * (1 - (1-(4096/size))^(time*wps)) / (time * 2^20), > > Thanks for more testings and the statistics. > > I had a feeling that this formula may or may not be accurate, but that's > less of an issue here. 
> > > where size is in bytes, time is in seconds, and wps is number of > > writes per second (I measured approximately 3000000 on my system). > > > > Theoretical values and values obtained with page-sampling are > > approximately close (<=25%). Dirty-bitmap values are much lower, > > likely because the majority of writes cause page faults. Even though > > dirty-bitmap logic is closer to what is happening during live > > migration, I still favor page sampling because the latter doesn't > > impact the performance of VM too much. > > Do you really use page samplings in production? I don't remember I > mentioned it anywhere before, but it will provide very wrong number when > the memory updates has a locality, afaik. For example, when 4G VM only has > 1G actively updated, the result can be 25% of reality iiuc, seeing that the > rest 3G didn't even change. It works only well with very distributed > memory updates. > Hmmm, such underestimation looks strange to me. I am willing to test page-sampling and see whether its quality can be improved. Do you have any specific suggestions on the application to use as a workload? If it turns out that page-sampling is not an option, then performance impact of the dirty-bitmap must be improved somehow. Maybe it makes sense to split memory into 4GiB chunks and measure dirty page rate independently for each of the chunks (without enabling page protections for memory outside of the currently processed chunk). But the downsides are that 1) total measurement time will increase proportionally by number of chunks 2) dirty page rate will be overestimated. But actually I am still hoping on page sampling. Since my goal is to roughly predict what can be migrated and what cannot be, I would prefer to keep predictor as lite as possible, even at the cost of (overestimation) error. > > > > Whether calc-time < 1sec is meaningful or not depends on the size > > of memory region with active writes. > > 1. 
If we have big VM and writes are evenly spread over the whole > > address space, then almost all writes will go into unique pages. > > In this case number of dirty pages will grow approximately > > linearly with time for small calc-time values. > > 2. But if memory region with active writes is small enough, then many > > writes will go to the same page, and the number of dirty pages > > will grow sublinearly even for small calc-time values. Note that > > the second scenario can happen even VM RAM is big. For example, > > imagine 128GiB VM with in-memory database that is used for reading. > > Although VM size is big, the memory region with active writes is > > just the application stack. > > No issue here to support small calc-time. I think as long as it'll be > worthwhile in any use case I'd be fine with it (rather than working for all > use cases). Not a super high bar to maintain the change. > > I copied Yong too, he just volunteered to look after the dirtyrate stuff. > > Thanks, > > -- > Peter Xu
On Tue, Aug 01, 2023 at 05:55:29PM +0300, gudkov.andrei@huawei.com wrote: > Hmmm, such underestimation looks strange to me. I am willing to test > page-sampling and see whether its quality can be improved. Do you have > any specific suggestions on the application to use as a workload? I could have had a wrong impression here, sorry. I played again with the page sampling approach, and that's actually pretty decent.. I had that impression probably based on the fact that by default we chose 2 pages out of 1000-ish (consider workloads having 100-ish memory updates where no sample page falls into it), and I do remember in some cases of my test setups quite some time ago, it shows totally wrong numbers. But maybe I had a wrong test, or wrong memory. Now thinking about it, for random/seq on not so small memory ranges, that seems to all work. For very small ones spread over it goes to the random case. > > If it turns out that page-sampling is not an option, then performance > impact of the dirty-bitmap must be improved somehow. Maybe it makes > sense to split memory into 4GiB chunks and measure dirty page rate > independently for each of the chunks (without enabling page > protections for memory outside of the currently processed chunk). > But the downsides are that 1) total measurement time will increase > proportionally by number of chunks 2) dirty page rate will be > overestimated. > > But actually I am still hoping on page sampling. Since my goal is to > roughly predict what can be migrated and what cannot be, I would prefer > to keep predictor as lite as possible, even at the cost of > (overestimation) error. Yes I also hope that works as you said. Thanks, -- Peter Xu
© 2016 - 2024 Red Hat, Inc.