[v17] support dirty restraint on vCPU

[PATCH v17 6/8] softmmu/dirtylimit: Implement virtual CPU throttle

Posted by huangy81@chinatelecom.cn 3 years, 11 months ago

From: Hyman Huang(黄勇) <huangy81@chinatelecom.cn>

Setup a negative feedback system when vCPU thread
handling KVM_EXIT_DIRTY_RING_FULL exit by introducing
throttle_us_per_full field in struct CPUState. Sleep
throttle_us_per_full microseconds to throttle vCPU
if dirtylimit is in service.

Signed-off-by: Hyman Huang(黄勇) <huangy81@chinatelecom.cn>
Reviewed-by: Peter Xu <peterx@redhat.com>
---
 accel/kvm/kvm-all.c         |  19 ++-
 include/hw/core/cpu.h       |   6 +
 include/sysemu/dirtylimit.h |  15 +++
 softmmu/dirtylimit.c        | 291 ++++++++++++++++++++++++++++++++++++++++++++
 softmmu/trace-events        |   7 ++
 5 files changed, 337 insertions(+), 1 deletion(-)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 8821d80..98e43e6 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -45,6 +45,7 @@
 #include "qemu/guest-random.h"
 #include "sysemu/hw_accel.h"
 #include "kvm-cpus.h"
+#include "sysemu/dirtylimit.h"
 
 #include "hw/boards.h"
 
@@ -476,6 +477,7 @@ int kvm_init_vcpu(CPUState *cpu, Error **errp)
     cpu->kvm_state = s;
     cpu->vcpu_dirty = true;
     cpu->dirty_pages = 0;
+    cpu->throttle_us_per_full = 0;
 
     mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
     if (mmap_size < 0) {
@@ -1469,6 +1471,11 @@ static void *kvm_dirty_ring_reaper_thread(void *data)
          */
         sleep(1);
 
+        /* keep sleeping so that dirtylimit not be interfered by reaper */
+        if (dirtylimit_in_service()) {
+            continue;
+        }
+
         trace_kvm_dirty_ring_reaper("wakeup");
         r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING;
 
@@ -2964,8 +2971,18 @@ int kvm_cpu_exec(CPUState *cpu)
              */
             trace_kvm_dirty_ring_full(cpu->cpu_index);
             qemu_mutex_lock_iothread();
-            kvm_dirty_ring_reap(kvm_state, NULL);
+            /* We throttle vCPU by making it sleep once it exit from kernel
+             * due to dirty ring full. In the dirtylimit scenario, reaping
+             * all vCPUs after a single vCPU dirty ring get full result in
+             * the miss of sleep, so just reap the ring-fulled vCPU.
+             */
+            if (dirtylimit_in_service()) {
+                kvm_dirty_ring_reap(kvm_state, cpu);
+            } else {
+                kvm_dirty_ring_reap(kvm_state, NULL);
+            }
             qemu_mutex_unlock_iothread();
+            dirtylimit_vcpu_execute(cpu);
             ret = 0;
             break;
         case KVM_EXIT_SYSTEM_EVENT:
diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index 76ab3b8..dbeb31a 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -411,6 +411,12 @@ struct CPUState {
      */
     bool throttle_thread_scheduled;
 
+    /*
+     * Sleep throttle_us_per_full microseconds once dirty ring is full
+     * if dirty page rate limit is enabled.
+     */
+    int64_t throttle_us_per_full;
+
     bool ignore_memory_transaction_failures;
 
     /* Used for user-only emulation of prctl(PR_SET_UNALIGN). */
diff --git a/include/sysemu/dirtylimit.h b/include/sysemu/dirtylimit.h
index da459f0..8d2c1f3 100644
--- a/include/sysemu/dirtylimit.h
+++ b/include/sysemu/dirtylimit.h
@@ -19,4 +19,19 @@ void vcpu_dirty_rate_stat_start(void);
 void vcpu_dirty_rate_stat_stop(void);
 void vcpu_dirty_rate_stat_initialize(void);
 void vcpu_dirty_rate_stat_finalize(void);
+
+void dirtylimit_state_lock(void);
+void dirtylimit_state_unlock(void);
+void dirtylimit_state_initialize(void);
+void dirtylimit_state_finalize(void);
+bool dirtylimit_in_service(void);
+bool dirtylimit_vcpu_index_valid(int cpu_index);
+void dirtylimit_process(void);
+void dirtylimit_change(bool start);
+void dirtylimit_set_vcpu(int cpu_index,
+                         uint64_t quota,
+                         bool enable);
+void dirtylimit_set_all(uint64_t quota,
+                        bool enable);
+void dirtylimit_vcpu_execute(CPUState *cpu);
 #endif
diff --git a/softmmu/dirtylimit.c b/softmmu/dirtylimit.c
index 6102e8c..76d0b44 100644
--- a/softmmu/dirtylimit.c
+++ b/softmmu/dirtylimit.c
@@ -18,6 +18,26 @@
 #include "sysemu/dirtylimit.h"
 #include "exec/memory.h"
 #include "hw/boards.h"
+#include "sysemu/kvm.h"
+#include "trace.h"
+
+/*
+ * Dirtylimit stop working if dirty page rate error
+ * value less than DIRTYLIMIT_TOLERANCE_RANGE
+ */
+#define DIRTYLIMIT_TOLERANCE_RANGE  25  /* MB/s */
+/*
+ * Plus or minus vcpu sleep time linearly if dirty
+ * page rate error value percentage over
+ * DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT.
+ * Otherwise, plus or minus a fixed vcpu sleep time.
+ */
+#define DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT     50
+/*
+ * Max vcpu sleep time percentage during a cycle
+ * composed of dirty ring full and sleep time.
+ */
+#define DIRTYLIMIT_THROTTLE_PCT_MAX 99
 
 struct {
     VcpuStat stat;
@@ -25,6 +45,30 @@ struct {
     QemuThread thread;
 } *vcpu_dirty_rate_stat;
 
+typedef struct VcpuDirtyLimitState {
+    int cpu_index;
+    bool enabled;
+    /*
+     * Quota dirty page rate, unit is MB/s
+     * zero if not enabled.
+     */
+    uint64_t quota;
+} VcpuDirtyLimitState;
+
+struct {
+    VcpuDirtyLimitState *states;
+    /* Max cpus number configured by user */
+    int max_cpus;
+    /* Number of vcpu under dirtylimit */
+    int limited_nvcpu;
+} *dirtylimit_state;
+
+/* protect dirtylimit_state */
+static QemuMutex dirtylimit_mutex;
+
+/* dirtylimit thread quit if dirtylimit_quit is true */
+static bool dirtylimit_quit;
+
 static void vcpu_dirty_rate_stat_collect(void)
 {
     VcpuStat stat;
@@ -54,6 +98,9 @@ static void *vcpu_dirty_rate_stat_thread(void *opaque)
 
     while (qatomic_read(&vcpu_dirty_rate_stat->running)) {
         vcpu_dirty_rate_stat_collect();
+        if (dirtylimit_in_service()) {
+            dirtylimit_process();
+        }
     }
 
     /* stop log sync */
@@ -86,9 +133,11 @@ void vcpu_dirty_rate_stat_start(void)
 void vcpu_dirty_rate_stat_stop(void)
 {
     qatomic_set(&vcpu_dirty_rate_stat->running, 0);
+    dirtylimit_state_unlock();
     qemu_mutex_unlock_iothread();
     qemu_thread_join(&vcpu_dirty_rate_stat->thread);
     qemu_mutex_lock_iothread();
+    dirtylimit_state_lock();
 }
 
 void vcpu_dirty_rate_stat_initialize(void)
@@ -114,3 +163,245 @@ void vcpu_dirty_rate_stat_finalize(void)
     free(vcpu_dirty_rate_stat);
     vcpu_dirty_rate_stat = NULL;
 }
+
+void dirtylimit_state_lock(void)
+{
+    qemu_mutex_lock(&dirtylimit_mutex);
+}
+
+void dirtylimit_state_unlock(void)
+{
+    qemu_mutex_unlock(&dirtylimit_mutex);
+}
+
+static void
+__attribute__((__constructor__)) dirtylimit_mutex_init(void)
+{
+    qemu_mutex_init(&dirtylimit_mutex);
+}
+
+static inline VcpuDirtyLimitState *dirtylimit_vcpu_get_state(int cpu_index)
+{
+    return &dirtylimit_state->states[cpu_index];
+}
+
+void dirtylimit_state_initialize(void)
+{
+    MachineState *ms = MACHINE(qdev_get_machine());
+    int max_cpus = ms->smp.max_cpus;
+    int i;
+
+    dirtylimit_state = g_malloc0(sizeof(*dirtylimit_state));
+
+    dirtylimit_state->states =
+            g_malloc0(sizeof(VcpuDirtyLimitState) * max_cpus);
+
+    for (i = 0; i < max_cpus; i++) {
+        dirtylimit_state->states[i].cpu_index = i;
+    }
+
+    dirtylimit_state->max_cpus = max_cpus;
+    trace_dirtylimit_state_initialize(max_cpus);
+}
+
+void dirtylimit_state_finalize(void)
+{
+    free(dirtylimit_state->states);
+    dirtylimit_state->states = NULL;
+
+    free(dirtylimit_state);
+    dirtylimit_state = NULL;
+
+    trace_dirtylimit_state_finalize();
+}
+
+bool dirtylimit_in_service(void)
+{
+    return !!dirtylimit_state;
+}
+
+bool dirtylimit_vcpu_index_valid(int cpu_index)
+{
+    MachineState *ms = MACHINE(qdev_get_machine());
+
+    return !(cpu_index < 0 ||
+             cpu_index >= ms->smp.max_cpus);
+}
+
+static inline int64_t dirtylimit_dirty_ring_full_time(uint64_t dirtyrate)
+{
+    static uint64_t max_dirtyrate;
+    uint32_t dirty_ring_size = kvm_dirty_ring_size();
+    uint64_t dirty_ring_size_meory_MB =
+        dirty_ring_size * TARGET_PAGE_SIZE >> 20;
+
+    if (max_dirtyrate < dirtyrate) {
+        max_dirtyrate = dirtyrate;
+    }
+
+    return dirty_ring_size_meory_MB * 1000000 / max_dirtyrate;
+}
+
+static inline bool dirtylimit_done(uint64_t quota,
+                                   uint64_t current)
+{
+    uint64_t min, max;
+
+    min = MIN(quota, current);
+    max = MAX(quota, current);
+
+    return ((max - min) <= DIRTYLIMIT_TOLERANCE_RANGE) ? true : false;
+}
+
+static inline bool
+dirtylimit_need_linear_adjustment(uint64_t quota,
+                                  uint64_t current)
+{
+    uint64_t min, max;
+
+    min = MIN(quota, current);
+    max = MAX(quota, current);
+
+    return ((max - min) * 100 / max) > DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT;
+}
+
+static void dirtylimit_set_throttle(CPUState *cpu,
+                                    uint64_t quota,
+                                    uint64_t current)
+{
+    int64_t ring_full_time_us = 0;
+    uint64_t sleep_pct = 0;
+    uint64_t throttle_us = 0;
+
+    if (current == 0) {
+        cpu->throttle_us_per_full = 0;
+        return;
+    }
+
+    ring_full_time_us = dirtylimit_dirty_ring_full_time(current);
+
+    if (dirtylimit_need_linear_adjustment(quota, current)) {
+        if (quota < current) {
+            sleep_pct = (current - quota) * 100 / current;
+            throttle_us =
+                ring_full_time_us * sleep_pct / (double)(100 - sleep_pct);
+            cpu->throttle_us_per_full += throttle_us;
+        } else {
+            sleep_pct = (quota - current) * 100 / quota;
+            throttle_us =
+                ring_full_time_us * sleep_pct / (double)(100 - sleep_pct);
+            cpu->throttle_us_per_full -= throttle_us;
+        }
+
+        trace_dirtylimit_throttle_pct(cpu->cpu_index,
+                                      sleep_pct,
+                                      throttle_us);
+    } else {
+        if (quota < current) {
+            cpu->throttle_us_per_full += ring_full_time_us / 10;
+        } else {
+            cpu->throttle_us_per_full -= ring_full_time_us / 10;
+        }
+    }
+
+    /*
+     * TODO: in the big kvm_dirty_ring_size case (eg: 65536, or other scenario),
+     *       current dirty page rate may never reach the quota, we should stop
+     *       increasing sleep time?
+     */
+    cpu->throttle_us_per_full = MIN(cpu->throttle_us_per_full,
+        ring_full_time_us * DIRTYLIMIT_THROTTLE_PCT_MAX);
+
+    cpu->throttle_us_per_full = MAX(cpu->throttle_us_per_full, 0);
+}
+
+static void dirtylimit_adjust_throttle(CPUState *cpu)
+{
+    uint64_t quota = 0;
+    uint64_t current = 0;
+    int cpu_index = cpu->cpu_index;
+
+    quota = dirtylimit_vcpu_get_state(cpu_index)->quota;
+    current = vcpu_dirty_rate_get(cpu_index);
+
+    if (!dirtylimit_done(quota, current)) {
+        dirtylimit_set_throttle(cpu, quota, current);
+    }
+
+    return;
+}
+
+void dirtylimit_process(void)
+{
+    CPUState *cpu;
+
+    if (!qatomic_read(&dirtylimit_quit)) {
+        dirtylimit_state_lock();
+
+        if (!dirtylimit_in_service()) {
+            dirtylimit_state_unlock();
+            return;
+        }
+
+        CPU_FOREACH(cpu) {
+            if (!dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled) {
+                continue;
+            }
+            dirtylimit_adjust_throttle(cpu);
+        }
+        dirtylimit_state_unlock();
+    }
+}
+
+void dirtylimit_change(bool start)
+{
+    if (start) {
+        qatomic_set(&dirtylimit_quit, 0);
+    } else {
+        qatomic_set(&dirtylimit_quit, 1);
+    }
+}
+
+void dirtylimit_set_vcpu(int cpu_index,
+                         uint64_t quota,
+                         bool enable)
+{
+    trace_dirtylimit_set_vcpu(cpu_index, quota);
+
+    if (enable) {
+        dirtylimit_state->states[cpu_index].quota = quota;
+        if (!dirtylimit_vcpu_get_state(cpu_index)->enabled) {
+            dirtylimit_state->limited_nvcpu++;
+        }
+    } else {
+        dirtylimit_state->states[cpu_index].quota = 0;
+        if (dirtylimit_state->states[cpu_index].enabled) {
+            dirtylimit_state->limited_nvcpu--;
+        }
+    }
+
+    dirtylimit_state->states[cpu_index].enabled = enable;
+}
+
+void dirtylimit_set_all(uint64_t quota,
+                        bool enable)
+{
+    MachineState *ms = MACHINE(qdev_get_machine());
+    int max_cpus = ms->smp.max_cpus;
+    int i;
+
+    for (i = 0; i < max_cpus; i++) {
+        dirtylimit_set_vcpu(i, quota, enable);
+    }
+}
+
+void dirtylimit_vcpu_execute(CPUState *cpu)
+{
+    if (dirtylimit_in_service() &&
+        dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled &&
+        cpu->throttle_us_per_full) {
+        trace_dirtylimit_vcpu_execute(cpu->cpu_index,
+                cpu->throttle_us_per_full);
+        usleep(cpu->throttle_us_per_full);
+    }
+}
diff --git a/softmmu/trace-events b/softmmu/trace-events
index 9c88887..22606dc 100644
--- a/softmmu/trace-events
+++ b/softmmu/trace-events
@@ -31,3 +31,10 @@ runstate_set(int current_state, const char *current_state_str, int new_state, co
 system_wakeup_request(int reason) "reason=%d"
 qemu_system_shutdown_request(int reason) "reason=%d"
 qemu_system_powerdown_request(void) ""
+
+#dirtylimit.c
+dirtylimit_state_initialize(int max_cpus) "dirtylimit state initialize: max cpus %d"
+dirtylimit_state_finalize(void)
+dirtylimit_throttle_pct(int cpu_index, uint64_t pct, int64_t time_us) "CPU[%d] throttle percent: %" PRIu64 ", throttle adjust time %"PRIi64 " us"
+dirtylimit_set_vcpu(int cpu_index, uint64_t quota) "CPU[%d] set dirty page rate limit %"PRIu64
+dirtylimit_vcpu_execute(int cpu_index, int64_t sleep_time_us) "CPU[%d] sleep %"PRIi64 " us"
-- 
1.8.3.1

Re: [PATCH v17 6/8] softmmu/dirtylimit: Implement virtual CPU throttle

Posted by manish.mishra 3 years, 8 months ago

Hi Hyman Huang,

I had few doubts regarding this patch series.

1. Why we choose for dirty rate limit per vcpu. I mean it becomes very hard for user to decide per

     vcpu dirty rate limit. For e.g. we have 1Gbps network and 10 vcpu vm. Now if someone wants to

     keep criteria for convergence as total dirty rate of VM should be lesser than half of available

     bandwidth. For this case to ensure convergence user has to give dirty rate limit per vcpu

     as 1Gbps/ 2 / 10 = 50Mbps. But assume then that VM has only 1 thread which is actively

     dirtying memory, in that case so much of available quota will be wasted. So would not it be

     better to use dirty rate limit control per VM instead of vcpu?

2. Also Here we are adaptively trying to adjust sleep time based on current obsered dirty rate and

     dirty rate limit. Can it be more forceful like. Assume we have dirty rate limit of 10pages

     per sec and auto-converge/ dirty rate limit was triggered at time 0. Now at any point of time assume

     at time 10 sec if number of pages dirtyed are more than 100pages we sleep for interpolated amount

     of time. Basically at every dirty ring exit we can check if current number of pages dirtied are more than

     what should be allowed by this time?

thanks

Manish Mishra

On 02/03/22 11:25 pm, huangy81@chinatelecom.cn wrote:
> From: Hyman Huang(黄勇)<huangy81@chinatelecom.cn>
>
> Setup a negative feedback system when vCPU thread
> handling KVM_EXIT_DIRTY_RING_FULL exit by introducing
> throttle_us_per_full field in struct CPUState. Sleep
> throttle_us_per_full microseconds to throttle vCPU
> if dirtylimit is in service.
>
> Signed-off-by: Hyman Huang(黄勇)<huangy81@chinatelecom.cn>
> Reviewed-by: Peter Xu<peterx@redhat.com>
> ---
>   accel/kvm/kvm-all.c         |  19 ++-
>   include/hw/core/cpu.h       |   6 +
>   include/sysemu/dirtylimit.h |  15 +++
>   softmmu/dirtylimit.c        | 291 ++++++++++++++++++++++++++++++++++++++++++++
>   softmmu/trace-events        |   7 ++
>   5 files changed, 337 insertions(+), 1 deletion(-)
>
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index 8821d80..98e43e6 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -45,6 +45,7 @@
>   #include "qemu/guest-random.h"
>   #include "sysemu/hw_accel.h"
>   #include "kvm-cpus.h"
> +#include "sysemu/dirtylimit.h"
>   
>   #include "hw/boards.h"
>   
> @@ -476,6 +477,7 @@ int kvm_init_vcpu(CPUState *cpu, Error **errp)
>       cpu->kvm_state = s;
>       cpu->vcpu_dirty = true;
>       cpu->dirty_pages = 0;
> +    cpu->throttle_us_per_full = 0;
>   
>       mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
>       if (mmap_size < 0) {
> @@ -1469,6 +1471,11 @@ static void *kvm_dirty_ring_reaper_thread(void *data)
>            */
>           sleep(1);
>   
> +        /* keep sleeping so that dirtylimit not be interfered by reaper */
> +        if (dirtylimit_in_service()) {
> +            continue;
> +        }
> +
>           trace_kvm_dirty_ring_reaper("wakeup");
>           r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING;
>   
> @@ -2964,8 +2971,18 @@ int kvm_cpu_exec(CPUState *cpu)
>                */
>               trace_kvm_dirty_ring_full(cpu->cpu_index);
>               qemu_mutex_lock_iothread();
> -            kvm_dirty_ring_reap(kvm_state, NULL);
> +            /* We throttle vCPU by making it sleep once it exit from kernel
> +             * due to dirty ring full. In the dirtylimit scenario, reaping
> +             * all vCPUs after a single vCPU dirty ring get full result in
> +             * the miss of sleep, so just reap the ring-fulled vCPU.
> +             */
> +            if (dirtylimit_in_service()) {
> +                kvm_dirty_ring_reap(kvm_state, cpu);
> +            } else {
> +                kvm_dirty_ring_reap(kvm_state, NULL);
> +            }
>               qemu_mutex_unlock_iothread();
> +            dirtylimit_vcpu_execute(cpu);
>               ret = 0;
>               break;
>           case KVM_EXIT_SYSTEM_EVENT:
> diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
> index 76ab3b8..dbeb31a 100644
> --- a/include/hw/core/cpu.h
> +++ b/include/hw/core/cpu.h
> @@ -411,6 +411,12 @@ struct CPUState {
>        */
>       bool throttle_thread_scheduled;
>   
> +    /*
> +     * Sleep throttle_us_per_full microseconds once dirty ring is full
> +     * if dirty page rate limit is enabled.
> +     */
> +    int64_t throttle_us_per_full;
> +
>       bool ignore_memory_transaction_failures;
>   
>       /* Used for user-only emulation of prctl(PR_SET_UNALIGN). */
> diff --git a/include/sysemu/dirtylimit.h b/include/sysemu/dirtylimit.h
> index da459f0..8d2c1f3 100644
> --- a/include/sysemu/dirtylimit.h
> +++ b/include/sysemu/dirtylimit.h
> @@ -19,4 +19,19 @@ void vcpu_dirty_rate_stat_start(void);
>   void vcpu_dirty_rate_stat_stop(void);
>   void vcpu_dirty_rate_stat_initialize(void);
>   void vcpu_dirty_rate_stat_finalize(void);
> +
> +void dirtylimit_state_lock(void);
> +void dirtylimit_state_unlock(void);
> +void dirtylimit_state_initialize(void);
> +void dirtylimit_state_finalize(void);
> +bool dirtylimit_in_service(void);
> +bool dirtylimit_vcpu_index_valid(int cpu_index);
> +void dirtylimit_process(void);
> +void dirtylimit_change(bool start);
> +void dirtylimit_set_vcpu(int cpu_index,
> +                         uint64_t quota,
> +                         bool enable);
> +void dirtylimit_set_all(uint64_t quota,
> +                        bool enable);
> +void dirtylimit_vcpu_execute(CPUState *cpu);
>   #endif
> diff --git a/softmmu/dirtylimit.c b/softmmu/dirtylimit.c
> index 6102e8c..76d0b44 100644
> --- a/softmmu/dirtylimit.c
> +++ b/softmmu/dirtylimit.c
> @@ -18,6 +18,26 @@
>   #include "sysemu/dirtylimit.h"
>   #include "exec/memory.h"
>   #include "hw/boards.h"
> +#include "sysemu/kvm.h"
> +#include "trace.h"
> +
> +/*
> + * Dirtylimit stop working if dirty page rate error
> + * value less than DIRTYLIMIT_TOLERANCE_RANGE
> + */
> +#define DIRTYLIMIT_TOLERANCE_RANGE  25  /* MB/s */
> +/*
> + * Plus or minus vcpu sleep time linearly if dirty
> + * page rate error value percentage over
> + * DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT.
> + * Otherwise, plus or minus a fixed vcpu sleep time.
> + */
> +#define DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT     50
> +/*
> + * Max vcpu sleep time percentage during a cycle
> + * composed of dirty ring full and sleep time.
> + */
> +#define DIRTYLIMIT_THROTTLE_PCT_MAX 99
>   
>   struct {
>       VcpuStat stat;
> @@ -25,6 +45,30 @@ struct {
>       QemuThread thread;
>   } *vcpu_dirty_rate_stat;
>   
> +typedef struct VcpuDirtyLimitState {
> +    int cpu_index;
> +    bool enabled;
> +    /*
> +     * Quota dirty page rate, unit is MB/s
> +     * zero if not enabled.
> +     */
> +    uint64_t quota;
> +} VcpuDirtyLimitState;
> +
> +struct {
> +    VcpuDirtyLimitState *states;
> +    /* Max cpus number configured by user */
> +    int max_cpus;
> +    /* Number of vcpu under dirtylimit */
> +    int limited_nvcpu;
> +} *dirtylimit_state;
> +
> +/* protect dirtylimit_state */
> +static QemuMutex dirtylimit_mutex;
> +
> +/* dirtylimit thread quit if dirtylimit_quit is true */
> +static bool dirtylimit_quit;
> +
>   static void vcpu_dirty_rate_stat_collect(void)
>   {
>       VcpuStat stat;
> @@ -54,6 +98,9 @@ static void *vcpu_dirty_rate_stat_thread(void *opaque)
>   
>       while (qatomic_read(&vcpu_dirty_rate_stat->running)) {
>           vcpu_dirty_rate_stat_collect();
> +        if (dirtylimit_in_service()) {
> +            dirtylimit_process();
> +        }
>       }
>   
>       /* stop log sync */
> @@ -86,9 +133,11 @@ void vcpu_dirty_rate_stat_start(void)
>   void vcpu_dirty_rate_stat_stop(void)
>   {
>       qatomic_set(&vcpu_dirty_rate_stat->running, 0);
> +    dirtylimit_state_unlock();
>       qemu_mutex_unlock_iothread();
>       qemu_thread_join(&vcpu_dirty_rate_stat->thread);
>       qemu_mutex_lock_iothread();
> +    dirtylimit_state_lock();
>   }
>   
>   void vcpu_dirty_rate_stat_initialize(void)
> @@ -114,3 +163,245 @@ void vcpu_dirty_rate_stat_finalize(void)
>       free(vcpu_dirty_rate_stat);
>       vcpu_dirty_rate_stat = NULL;
>   }
> +
> +void dirtylimit_state_lock(void)
> +{
> +    qemu_mutex_lock(&dirtylimit_mutex);
> +}
> +
> +void dirtylimit_state_unlock(void)
> +{
> +    qemu_mutex_unlock(&dirtylimit_mutex);
> +}
> +
> +static void
> +__attribute__((__constructor__)) dirtylimit_mutex_init(void)
> +{
> +    qemu_mutex_init(&dirtylimit_mutex);
> +}
> +
> +static inline VcpuDirtyLimitState *dirtylimit_vcpu_get_state(int cpu_index)
> +{
> +    return &dirtylimit_state->states[cpu_index];
> +}
> +
> +void dirtylimit_state_initialize(void)
> +{
> +    MachineState *ms = MACHINE(qdev_get_machine());
> +    int max_cpus = ms->smp.max_cpus;
> +    int i;
> +
> +    dirtylimit_state = g_malloc0(sizeof(*dirtylimit_state));
> +
> +    dirtylimit_state->states =
> +            g_malloc0(sizeof(VcpuDirtyLimitState) * max_cpus);
> +
> +    for (i = 0; i < max_cpus; i++) {
> +        dirtylimit_state->states[i].cpu_index = i;
> +    }
> +
> +    dirtylimit_state->max_cpus = max_cpus;
> +    trace_dirtylimit_state_initialize(max_cpus);
> +}
> +
> +void dirtylimit_state_finalize(void)
> +{
> +    free(dirtylimit_state->states);
> +    dirtylimit_state->states = NULL;
> +
> +    free(dirtylimit_state);
> +    dirtylimit_state = NULL;
> +
> +    trace_dirtylimit_state_finalize();
> +}
> +
> +bool dirtylimit_in_service(void)
> +{
> +    return !!dirtylimit_state;
> +}
> +
> +bool dirtylimit_vcpu_index_valid(int cpu_index)
> +{
> +    MachineState *ms = MACHINE(qdev_get_machine());
> +
> +    return !(cpu_index < 0 ||
> +             cpu_index >= ms->smp.max_cpus);
> +}
> +
> +static inline int64_t dirtylimit_dirty_ring_full_time(uint64_t dirtyrate)
> +{
> +    static uint64_t max_dirtyrate;
> +    uint32_t dirty_ring_size = kvm_dirty_ring_size();
> +    uint64_t dirty_ring_size_meory_MB =
> +        dirty_ring_size * TARGET_PAGE_SIZE >> 20;
> +
> +    if (max_dirtyrate < dirtyrate) {
> +        max_dirtyrate = dirtyrate;
> +    }
> +
> +    return dirty_ring_size_meory_MB * 1000000 / max_dirtyrate;
> +}
> +
> +static inline bool dirtylimit_done(uint64_t quota,
> +                                   uint64_t current)
> +{
> +    uint64_t min, max;
> +
> +    min = MIN(quota, current);
> +    max = MAX(quota, current);
> +
> +    return ((max - min) <= DIRTYLIMIT_TOLERANCE_RANGE) ? true : false;
> +}
> +
> +static inline bool
> +dirtylimit_need_linear_adjustment(uint64_t quota,
> +                                  uint64_t current)
> +{
> +    uint64_t min, max;
> +
> +    min = MIN(quota, current);
> +    max = MAX(quota, current);
> +
> +    return ((max - min) * 100 / max) > DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT;
> +}
> +
> +static void dirtylimit_set_throttle(CPUState *cpu,
> +                                    uint64_t quota,
> +                                    uint64_t current)
> +{
> +    int64_t ring_full_time_us = 0;
> +    uint64_t sleep_pct = 0;
> +    uint64_t throttle_us = 0;
> +
> +    if (current == 0) {
> +        cpu->throttle_us_per_full = 0;
> +        return;
> +    }
> +
> +    ring_full_time_us = dirtylimit_dirty_ring_full_time(current);
> +
> +    if (dirtylimit_need_linear_adjustment(quota, current)) {
> +        if (quota < current) {
> +            sleep_pct = (current - quota) * 100 / current;
> +            throttle_us =
> +                ring_full_time_us * sleep_pct / (double)(100 - sleep_pct);
> +            cpu->throttle_us_per_full += throttle_us;
> +        } else {
> +            sleep_pct = (quota - current) * 100 / quota;
> +            throttle_us =
> +                ring_full_time_us * sleep_pct / (double)(100 - sleep_pct);
> +            cpu->throttle_us_per_full -= throttle_us;
> +        }
> +
> +        trace_dirtylimit_throttle_pct(cpu->cpu_index,
> +                                      sleep_pct,
> +                                      throttle_us);
> +    } else {
> +        if (quota < current) {
> +            cpu->throttle_us_per_full += ring_full_time_us / 10;
> +        } else {
> +            cpu->throttle_us_per_full -= ring_full_time_us / 10;
> +        }
> +    }
> +
> +    /*
> +     * TODO: in the big kvm_dirty_ring_size case (eg: 65536, or other scenario),
> +     *       current dirty page rate may never reach the quota, we should stop
> +     *       increasing sleep time?
> +     */
> +    cpu->throttle_us_per_full = MIN(cpu->throttle_us_per_full,
> +        ring_full_time_us * DIRTYLIMIT_THROTTLE_PCT_MAX);
> +
> +    cpu->throttle_us_per_full = MAX(cpu->throttle_us_per_full, 0);
> +}
> +
> +static void dirtylimit_adjust_throttle(CPUState *cpu)
> +{
> +    uint64_t quota = 0;
> +    uint64_t current = 0;
> +    int cpu_index = cpu->cpu_index;
> +
> +    quota = dirtylimit_vcpu_get_state(cpu_index)->quota;
> +    current = vcpu_dirty_rate_get(cpu_index);
> +
> +    if (!dirtylimit_done(quota, current)) {
> +        dirtylimit_set_throttle(cpu, quota, current);
> +    }
> +
> +    return;
> +}
> +
> +void dirtylimit_process(void)
> +{
> +    CPUState *cpu;
> +
> +    if (!qatomic_read(&dirtylimit_quit)) {
> +        dirtylimit_state_lock();
> +
> +        if (!dirtylimit_in_service()) {
> +            dirtylimit_state_unlock();
> +            return;
> +        }
> +
> +        CPU_FOREACH(cpu) {
> +            if (!dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled) {
> +                continue;
> +            }
> +            dirtylimit_adjust_throttle(cpu);
> +        }
> +        dirtylimit_state_unlock();
> +    }
> +}
> +
> +void dirtylimit_change(bool start)
> +{
> +    if (start) {
> +        qatomic_set(&dirtylimit_quit, 0);
> +    } else {
> +        qatomic_set(&dirtylimit_quit, 1);
> +    }
> +}
> +
> +void dirtylimit_set_vcpu(int cpu_index,
> +                         uint64_t quota,
> +                         bool enable)
> +{
> +    trace_dirtylimit_set_vcpu(cpu_index, quota);
> +
> +    if (enable) {
> +        dirtylimit_state->states[cpu_index].quota = quota;
> +        if (!dirtylimit_vcpu_get_state(cpu_index)->enabled) {
> +            dirtylimit_state->limited_nvcpu++;
> +        }
> +    } else {
> +        dirtylimit_state->states[cpu_index].quota = 0;
> +        if (dirtylimit_state->states[cpu_index].enabled) {
> +            dirtylimit_state->limited_nvcpu--;
> +        }
> +    }
> +
> +    dirtylimit_state->states[cpu_index].enabled = enable;
> +}
> +
> +void dirtylimit_set_all(uint64_t quota,
> +                        bool enable)
> +{
> +    MachineState *ms = MACHINE(qdev_get_machine());
> +    int max_cpus = ms->smp.max_cpus;
> +    int i;
> +
> +    for (i = 0; i < max_cpus; i++) {
> +        dirtylimit_set_vcpu(i, quota, enable);
> +    }
> +}
> +
> +void dirtylimit_vcpu_execute(CPUState *cpu)
> +{
> +    if (dirtylimit_in_service() &&
> +        dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled &&
> +        cpu->throttle_us_per_full) {
> +        trace_dirtylimit_vcpu_execute(cpu->cpu_index,
> +                cpu->throttle_us_per_full);
> +        usleep(cpu->throttle_us_per_full);
> +    }
> +}
> diff --git a/softmmu/trace-events b/softmmu/trace-events
> index 9c88887..22606dc 100644
> --- a/softmmu/trace-events
> +++ b/softmmu/trace-events
> @@ -31,3 +31,10 @@ runstate_set(int current_state, const char *current_state_str, int new_state, co
>   system_wakeup_request(int reason) "reason=%d"
>   qemu_system_shutdown_request(int reason) "reason=%d"
>   qemu_system_powerdown_request(void) ""
> +
> +#dirtylimit.c
> +dirtylimit_state_initialize(int max_cpus) "dirtylimit state initialize: max cpus %d"
> +dirtylimit_state_finalize(void)
> +dirtylimit_throttle_pct(int cpu_index, uint64_t pct, int64_t time_us) "CPU[%d] throttle percent: %" PRIu64 ", throttle adjust time %"PRIi64 " us"
> +dirtylimit_set_vcpu(int cpu_index, uint64_t quota) "CPU[%d] set dirty page rate limit %"PRIu64
> +dirtylimit_vcpu_execute(int cpu_index, int64_t sleep_time_us) "CPU[%d] sleep %"PRIi64 " us"

Re: [PATCH v17 6/8] softmmu/dirtylimit: Implement virtual CPU throttle

Posted by Hyman Huang 3 years, 8 months ago

Thanks Manish for the comment, i'll give my explanation and any 
supplement are welcomed.

在 2022/5/17 1:13, manish.mishra 写道:
> Hi Hyman Huang,
> 
> I had few doubts regarding this patch series.
For the first point, m'm rudely guessing that you want to figure out how 
should we set the vcpu dirty limit correctly during live migration to 
make it convergent.

This can be achieved by set a single dirtylimit value on all vcpus, the 
value need not be equivalent of the half of available bandwidth so 
precisely since the dirtylimit is sufficient conditions of migration 
sucess, but not necessary condition.

We can set the dirtylimit as the minimum of what user can tolerate, in 
most case, migration can achieve convergent in advance and do the 
switchover with the real dirtyrate greater than dirtylimit. This can be 
implemented because Qemu will check the criteria every iteration, once 
it meet the condition, Qemu will do the switch over no matter what 
convergent algo is.
> 
> 1. Why we choose for dirty rate limit per vcpu. I mean it becomes very 
> hard for user to decide per
> 
>      vcpu dirty rate limit. For e.g. we have 1Gbps network and 10 vcpu 
> vm. Now if someone wants to
> 
>      keep criteria for convergence as total dirty rate of VM should be 
> lesser than half of available
> 
>      bandwidth. For this case to ensure convergence user has to give 
> dirty rate limit per vcpu
> 
>      as 1Gbps/ 2 / 10 = 50Mbps. But assume then that VM has only 1 
> thread which is actively
> 
>      dirtying memory, in that case so much of available quota will be 
> wasted.
This is a good and frequent question about dirtylimit, as mentioned 
above, throttle occurs only when dirty ring full and exit to user space.
A vcpu is set up with dirtylimit during live migration, but it does not 
dirty memory, it may never get throttled.
The dirtylimit only throttle those vcpu who dirty memory and dirtyrate 
greater then dirtylimit.

  So would not it be
> 
>      better to use dirty rate limit control per VM instead of vcpu?
> 
> 2. Also Here we are adaptively trying to adjust sleep time based on 
> current obsered dirty rate and
> 
>      dirty rate limit. Can it be more forceful like. Assume we have 
> dirty rate limit of 10pages
> 
>      per sec and auto-converge/ dirty rate limit was triggered at time 
> 0. Now at any point of time assume
> 
>      at time 10 sec if number of pages dirtyed are more than 100pages we 
> sleep for interpolated amount
> 
>      of time. Basically at every dirty ring exit we can check if current 
> number of pages dirtied are more than
> 
>      what should be allowed by this time?
Yes, indeed, but as memtioned above, if dirty ring exit, it give Qemu a 
hint that vcpu is dirting memory, we should check it.

I post the series of dirtylimit capability for RFC, may be it can help 
me to explain the usage of vcpu dirty limit, it can be found here:
https://lore.kernel.org/qemu-devel/cover.1652762652.git.huangy81@chinatelecom.cn/

Thanks,
Yong
> 
> thanks
> 
> Manish Mishra
> 
> On 02/03/22 11:25 pm, huangy81@chinatelecom.cn wrote:
>> From: Hyman Huang(黄勇)<huangy81@chinatelecom.cn>
>>
>> Setup a negative feedback system when vCPU thread
>> handling KVM_EXIT_DIRTY_RING_FULL exit by introducing
>> throttle_us_per_full field in struct CPUState. Sleep
>> throttle_us_per_full microseconds to throttle vCPU
>> if dirtylimit is in service.
>>
>> Signed-off-by: Hyman Huang(黄勇)<huangy81@chinatelecom.cn>
>> Reviewed-by: Peter Xu<peterx@redhat.com>
>> ---
>>   accel/kvm/kvm-all.c         |  19 ++-
>>   include/hw/core/cpu.h       |   6 +
>>   include/sysemu/dirtylimit.h |  15 +++
>>   softmmu/dirtylimit.c        | 291 ++++++++++++++++++++++++++++++++++++++++++++
>>   softmmu/trace-events        |   7 ++
>>   5 files changed, 337 insertions(+), 1 deletion(-)
>>
>> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
>> index 8821d80..98e43e6 100644
>> --- a/accel/kvm/kvm-all.c
>> +++ b/accel/kvm/kvm-all.c
>> @@ -45,6 +45,7 @@
>>   #include "qemu/guest-random.h"
>>   #include "sysemu/hw_accel.h"
>>   #include "kvm-cpus.h"
>> +#include "sysemu/dirtylimit.h"
>>   
>>   #include "hw/boards.h"
>>   
>> @@ -476,6 +477,7 @@ int kvm_init_vcpu(CPUState *cpu, Error **errp)
>>       cpu->kvm_state = s;
>>       cpu->vcpu_dirty = true;
>>       cpu->dirty_pages = 0;
>> +    cpu->throttle_us_per_full = 0;
>>   
>>       mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
>>       if (mmap_size < 0) {
>> @@ -1469,6 +1471,11 @@ static void *kvm_dirty_ring_reaper_thread(void *data)
>>            */
>>           sleep(1);
>>   
>> +        /* keep sleeping so that dirtylimit not be interfered by reaper */
>> +        if (dirtylimit_in_service()) {
>> +            continue;
>> +        }
>> +
>>           trace_kvm_dirty_ring_reaper("wakeup");
>>           r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING;
>>   
>> @@ -2964,8 +2971,18 @@ int kvm_cpu_exec(CPUState *cpu)
>>                */
>>               trace_kvm_dirty_ring_full(cpu->cpu_index);
>>               qemu_mutex_lock_iothread();
>> -            kvm_dirty_ring_reap(kvm_state, NULL);
>> +            /* We throttle vCPU by making it sleep once it exit from kernel
>> +             * due to dirty ring full. In the dirtylimit scenario, reaping
>> +             * all vCPUs after a single vCPU dirty ring get full result in
>> +             * the miss of sleep, so just reap the ring-fulled vCPU.
>> +             */
>> +            if (dirtylimit_in_service()) {
>> +                kvm_dirty_ring_reap(kvm_state, cpu);
>> +            } else {
>> +                kvm_dirty_ring_reap(kvm_state, NULL);
>> +            }
>>               qemu_mutex_unlock_iothread();
>> +            dirtylimit_vcpu_execute(cpu);
>>               ret = 0;
>>               break;
>>           case KVM_EXIT_SYSTEM_EVENT:
>> diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
>> index 76ab3b8..dbeb31a 100644
>> --- a/include/hw/core/cpu.h
>> +++ b/include/hw/core/cpu.h
>> @@ -411,6 +411,12 @@ struct CPUState {
>>        */
>>       bool throttle_thread_scheduled;
>>   
>> +    /*
>> +     * Sleep throttle_us_per_full microseconds once dirty ring is full
>> +     * if dirty page rate limit is enabled.
>> +     */
>> +    int64_t throttle_us_per_full;
>> +
>>       bool ignore_memory_transaction_failures;
>>   
>>       /* Used for user-only emulation of prctl(PR_SET_UNALIGN). */
>> diff --git a/include/sysemu/dirtylimit.h b/include/sysemu/dirtylimit.h
>> index da459f0..8d2c1f3 100644
>> --- a/include/sysemu/dirtylimit.h
>> +++ b/include/sysemu/dirtylimit.h
>> @@ -19,4 +19,19 @@ void vcpu_dirty_rate_stat_start(void);
>>   void vcpu_dirty_rate_stat_stop(void);
>>   void vcpu_dirty_rate_stat_initialize(void);
>>   void vcpu_dirty_rate_stat_finalize(void);
>> +
>> +void dirtylimit_state_lock(void);
>> +void dirtylimit_state_unlock(void);
>> +void dirtylimit_state_initialize(void);
>> +void dirtylimit_state_finalize(void);
>> +bool dirtylimit_in_service(void);
>> +bool dirtylimit_vcpu_index_valid(int cpu_index);
>> +void dirtylimit_process(void);
>> +void dirtylimit_change(bool start);
>> +void dirtylimit_set_vcpu(int cpu_index,
>> +                         uint64_t quota,
>> +                         bool enable);
>> +void dirtylimit_set_all(uint64_t quota,
>> +                        bool enable);
>> +void dirtylimit_vcpu_execute(CPUState *cpu);
>>   #endif
>> diff --git a/softmmu/dirtylimit.c b/softmmu/dirtylimit.c
>> index 6102e8c..76d0b44 100644
>> --- a/softmmu/dirtylimit.c
>> +++ b/softmmu/dirtylimit.c
>> @@ -18,6 +18,26 @@
>>   #include "sysemu/dirtylimit.h"
>>   #include "exec/memory.h"
>>   #include "hw/boards.h"
>> +#include "sysemu/kvm.h"
>> +#include "trace.h"
>> +
>> +/*
>> + * Dirtylimit stop working if dirty page rate error
>> + * value less than DIRTYLIMIT_TOLERANCE_RANGE
>> + */
>> +#define DIRTYLIMIT_TOLERANCE_RANGE  25  /* MB/s */
>> +/*
>> + * Plus or minus vcpu sleep time linearly if dirty
>> + * page rate error value percentage over
>> + * DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT.
>> + * Otherwise, plus or minus a fixed vcpu sleep time.
>> + */
>> +#define DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT     50
>> +/*
>> + * Max vcpu sleep time percentage during a cycle
>> + * composed of dirty ring full and sleep time.
>> + */
>> +#define DIRTYLIMIT_THROTTLE_PCT_MAX 99
>>   
>>   struct {
>>       VcpuStat stat;
>> @@ -25,6 +45,30 @@ struct {
>>       QemuThread thread;
>>   } *vcpu_dirty_rate_stat;
>>   
>> +typedef struct VcpuDirtyLimitState {
>> +    int cpu_index;
>> +    bool enabled;
>> +    /*
>> +     * Quota dirty page rate, unit is MB/s
>> +     * zero if not enabled.
>> +     */
>> +    uint64_t quota;
>> +} VcpuDirtyLimitState;
>> +
>> +struct {
>> +    VcpuDirtyLimitState *states;
>> +    /* Max cpus number configured by user */
>> +    int max_cpus;
>> +    /* Number of vcpu under dirtylimit */
>> +    int limited_nvcpu;
>> +} *dirtylimit_state;
>> +
>> +/* protect dirtylimit_state */
>> +static QemuMutex dirtylimit_mutex;
>> +
>> +/* dirtylimit thread quit if dirtylimit_quit is true */
>> +static bool dirtylimit_quit;
>> +
>>   static void vcpu_dirty_rate_stat_collect(void)
>>   {
>>       VcpuStat stat;
>> @@ -54,6 +98,9 @@ static void *vcpu_dirty_rate_stat_thread(void *opaque)
>>   
>>       while (qatomic_read(&vcpu_dirty_rate_stat->running)) {
>>           vcpu_dirty_rate_stat_collect();
>> +        if (dirtylimit_in_service()) {
>> +            dirtylimit_process();
>> +        }
>>       }
>>   
>>       /* stop log sync */
>> @@ -86,9 +133,11 @@ void vcpu_dirty_rate_stat_start(void)
>>   void vcpu_dirty_rate_stat_stop(void)
>>   {
>>       qatomic_set(&vcpu_dirty_rate_stat->running, 0);
>> +    dirtylimit_state_unlock();
>>       qemu_mutex_unlock_iothread();
>>       qemu_thread_join(&vcpu_dirty_rate_stat->thread);
>>       qemu_mutex_lock_iothread();
>> +    dirtylimit_state_lock();
>>   }
>>   
>>   void vcpu_dirty_rate_stat_initialize(void)
>> @@ -114,3 +163,245 @@ void vcpu_dirty_rate_stat_finalize(void)
>>       free(vcpu_dirty_rate_stat);
>>       vcpu_dirty_rate_stat = NULL;
>>   }
>> +
>> +void dirtylimit_state_lock(void)
>> +{
>> +    qemu_mutex_lock(&dirtylimit_mutex);
>> +}
>> +
>> +void dirtylimit_state_unlock(void)
>> +{
>> +    qemu_mutex_unlock(&dirtylimit_mutex);
>> +}
>> +
>> +static void
>> +__attribute__((__constructor__)) dirtylimit_mutex_init(void)
>> +{
>> +    qemu_mutex_init(&dirtylimit_mutex);
>> +}
>> +
>> +static inline VcpuDirtyLimitState *dirtylimit_vcpu_get_state(int cpu_index)
>> +{
>> +    return &dirtylimit_state->states[cpu_index];
>> +}
>> +
>> +void dirtylimit_state_initialize(void)
>> +{
>> +    MachineState *ms = MACHINE(qdev_get_machine());
>> +    int max_cpus = ms->smp.max_cpus;
>> +    int i;
>> +
>> +    dirtylimit_state = g_malloc0(sizeof(*dirtylimit_state));
>> +
>> +    dirtylimit_state->states =
>> +            g_malloc0(sizeof(VcpuDirtyLimitState) * max_cpus);
>> +
>> +    for (i = 0; i < max_cpus; i++) {
>> +        dirtylimit_state->states[i].cpu_index = i;
>> +    }
>> +
>> +    dirtylimit_state->max_cpus = max_cpus;
>> +    trace_dirtylimit_state_initialize(max_cpus);
>> +}
>> +
>> +void dirtylimit_state_finalize(void)
>> +{
>> +    free(dirtylimit_state->states);
>> +    dirtylimit_state->states = NULL;
>> +
>> +    free(dirtylimit_state);
>> +    dirtylimit_state = NULL;
>> +
>> +    trace_dirtylimit_state_finalize();
>> +}
>> +
>> +bool dirtylimit_in_service(void)
>> +{
>> +    return !!dirtylimit_state;
>> +}
>> +
>> +bool dirtylimit_vcpu_index_valid(int cpu_index)
>> +{
>> +    MachineState *ms = MACHINE(qdev_get_machine());
>> +
>> +    return !(cpu_index < 0 ||
>> +             cpu_index >= ms->smp.max_cpus);
>> +}
>> +
>> +static inline int64_t dirtylimit_dirty_ring_full_time(uint64_t dirtyrate)
>> +{
>> +    static uint64_t max_dirtyrate;
>> +    uint32_t dirty_ring_size = kvm_dirty_ring_size();
>> +    uint64_t dirty_ring_size_meory_MB =
>> +        dirty_ring_size * TARGET_PAGE_SIZE >> 20;
>> +
>> +    if (max_dirtyrate < dirtyrate) {
>> +        max_dirtyrate = dirtyrate;
>> +    }
>> +
>> +    return dirty_ring_size_meory_MB * 1000000 / max_dirtyrate;
>> +}
>> +
>> +static inline bool dirtylimit_done(uint64_t quota,
>> +                                   uint64_t current)
>> +{
>> +    uint64_t min, max;
>> +
>> +    min = MIN(quota, current);
>> +    max = MAX(quota, current);
>> +
>> +    return ((max - min) <= DIRTYLIMIT_TOLERANCE_RANGE) ? true : false;
>> +}
>> +
>> +static inline bool
>> +dirtylimit_need_linear_adjustment(uint64_t quota,
>> +                                  uint64_t current)
>> +{
>> +    uint64_t min, max;
>> +
>> +    min = MIN(quota, current);
>> +    max = MAX(quota, current);
>> +
>> +    return ((max - min) * 100 / max) > DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT;
>> +}
>> +
>> +static void dirtylimit_set_throttle(CPUState *cpu,
>> +                                    uint64_t quota,
>> +                                    uint64_t current)
>> +{
>> +    int64_t ring_full_time_us = 0;
>> +    uint64_t sleep_pct = 0;
>> +    uint64_t throttle_us = 0;
>> +
>> +    if (current == 0) {
>> +        cpu->throttle_us_per_full = 0;
>> +        return;
>> +    }
>> +
>> +    ring_full_time_us = dirtylimit_dirty_ring_full_time(current);
>> +
>> +    if (dirtylimit_need_linear_adjustment(quota, current)) {
>> +        if (quota < current) {
>> +            sleep_pct = (current - quota) * 100 / current;
>> +            throttle_us =
>> +                ring_full_time_us * sleep_pct / (double)(100 - sleep_pct);
>> +            cpu->throttle_us_per_full += throttle_us;
>> +        } else {
>> +            sleep_pct = (quota - current) * 100 / quota;
>> +            throttle_us =
>> +                ring_full_time_us * sleep_pct / (double)(100 - sleep_pct);
>> +            cpu->throttle_us_per_full -= throttle_us;
>> +        }
>> +
>> +        trace_dirtylimit_throttle_pct(cpu->cpu_index,
>> +                                      sleep_pct,
>> +                                      throttle_us);
>> +    } else {
>> +        if (quota < current) {
>> +            cpu->throttle_us_per_full += ring_full_time_us / 10;
>> +        } else {
>> +            cpu->throttle_us_per_full -= ring_full_time_us / 10;
>> +        }
>> +    }
>> +
>> +    /*
>> +     * TODO: in the big kvm_dirty_ring_size case (eg: 65536, or other scenario),
>> +     *       current dirty page rate may never reach the quota, we should stop
>> +     *       increasing sleep time?
>> +     */
>> +    cpu->throttle_us_per_full = MIN(cpu->throttle_us_per_full,
>> +        ring_full_time_us * DIRTYLIMIT_THROTTLE_PCT_MAX);
>> +
>> +    cpu->throttle_us_per_full = MAX(cpu->throttle_us_per_full, 0);
>> +}
>> +
>> +static void dirtylimit_adjust_throttle(CPUState *cpu)
>> +{
>> +    uint64_t quota = 0;
>> +    uint64_t current = 0;
>> +    int cpu_index = cpu->cpu_index;
>> +
>> +    quota = dirtylimit_vcpu_get_state(cpu_index)->quota;
>> +    current = vcpu_dirty_rate_get(cpu_index);
>> +
>> +    if (!dirtylimit_done(quota, current)) {
>> +        dirtylimit_set_throttle(cpu, quota, current);
>> +    }
>> +
>> +    return;
>> +}
>> +
>> +void dirtylimit_process(void)
>> +{
>> +    CPUState *cpu;
>> +
>> +    if (!qatomic_read(&dirtylimit_quit)) {
>> +        dirtylimit_state_lock();
>> +
>> +        if (!dirtylimit_in_service()) {
>> +            dirtylimit_state_unlock();
>> +            return;
>> +        }
>> +
>> +        CPU_FOREACH(cpu) {
>> +            if (!dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled) {
>> +                continue;
>> +            }
>> +            dirtylimit_adjust_throttle(cpu);
>> +        }
>> +        dirtylimit_state_unlock();
>> +    }
>> +}
>> +
>> +void dirtylimit_change(bool start)
>> +{
>> +    if (start) {
>> +        qatomic_set(&dirtylimit_quit, 0);
>> +    } else {
>> +        qatomic_set(&dirtylimit_quit, 1);
>> +    }
>> +}
>> +
>> +void dirtylimit_set_vcpu(int cpu_index,
>> +                         uint64_t quota,
>> +                         bool enable)
>> +{
>> +    trace_dirtylimit_set_vcpu(cpu_index, quota);
>> +
>> +    if (enable) {
>> +        dirtylimit_state->states[cpu_index].quota = quota;
>> +        if (!dirtylimit_vcpu_get_state(cpu_index)->enabled) {
>> +            dirtylimit_state->limited_nvcpu++;
>> +        }
>> +    } else {
>> +        dirtylimit_state->states[cpu_index].quota = 0;
>> +        if (dirtylimit_state->states[cpu_index].enabled) {
>> +            dirtylimit_state->limited_nvcpu--;
>> +        }
>> +    }
>> +
>> +    dirtylimit_state->states[cpu_index].enabled = enable;
>> +}
>> +
>> +void dirtylimit_set_all(uint64_t quota,
>> +                        bool enable)
>> +{
>> +    MachineState *ms = MACHINE(qdev_get_machine());
>> +    int max_cpus = ms->smp.max_cpus;
>> +    int i;
>> +
>> +    for (i = 0; i < max_cpus; i++) {
>> +        dirtylimit_set_vcpu(i, quota, enable);
>> +    }
>> +}
>> +
>> +void dirtylimit_vcpu_execute(CPUState *cpu)
>> +{
>> +    if (dirtylimit_in_service() &&
>> +        dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled &&
>> +        cpu->throttle_us_per_full) {
>> +        trace_dirtylimit_vcpu_execute(cpu->cpu_index,
>> +                cpu->throttle_us_per_full);
>> +        usleep(cpu->throttle_us_per_full);
>> +    }
>> +}
>> diff --git a/softmmu/trace-events b/softmmu/trace-events
>> index 9c88887..22606dc 100644
>> --- a/softmmu/trace-events
>> +++ b/softmmu/trace-events
>> @@ -31,3 +31,10 @@ runstate_set(int current_state, const char *current_state_str, int new_state, co
>>   system_wakeup_request(int reason) "reason=%d"
>>   qemu_system_shutdown_request(int reason) "reason=%d"
>>   qemu_system_powerdown_request(void) ""
>> +
>> +#dirtylimit.c
>> +dirtylimit_state_initialize(int max_cpus) "dirtylimit state initialize: max cpus %d"
>> +dirtylimit_state_finalize(void)
>> +dirtylimit_throttle_pct(int cpu_index, uint64_t pct, int64_t time_us) "CPU[%d] throttle percent: %" PRIu64 ", throttle adjust time %"PRIi64 " us"
>> +dirtylimit_set_vcpu(int cpu_index, uint64_t quota) "CPU[%d] set dirty page rate limit %"PRIu64
>> +dirtylimit_vcpu_execute(int cpu_index, int64_t sleep_time_us) "CPU[%d] sleep %"PRIi64 " us"

-- 
Best regard

Hyman Huang(黄勇)

Re: [PATCH v17 6/8] softmmu/dirtylimit: Implement virtual CPU throttle

Posted by manish.mishra 3 years, 8 months ago

On 17/05/22 1:49 pm, Hyman Huang wrote:
> Thanks Manish for the comment, i'll give my explanation and any supplement are welcomed.
Really sorry for such late reply Hyman, this slipped my mind.
>
> 在 2022/5/17 1:13, manish.mishra 写道:
>> Hi Hyman Huang,
>>
>> I had few doubts regarding this patch series.
> For the first point, m'm rudely guessing that you want to figure out how should we set the vcpu dirty limit correctly during live migration to make it convergent.
>
> This can be achieved by set a single dirtylimit value on all vcpus, the value need not be equivalent of the half of available bandwidth so precisely since the dirtylimit is sufficient conditions of migration sucess, but not necessary condition.
>
> We can set the dirtylimit as the minimum of what user can tolerate, in most case, migration can achieve convergent in advance and do the switchover with the real dirtyrate greater than dirtylimit. This can be implemented because Qemu will check the criteria every iteration, once it meet the condition, Qemu will do the switch over no matter what convergent algo is.


Yes got it Hyman, my question was in direction that if we control dirty rate per vcpu, total dirty of VM become very unpredictable. For example if we have set dirty rate limit of each vcpu 50MBps for 10vcpu VM. Then total dirty rate of VM can be anywhere from 0-500MBps based on how many vcpu are active and how much. So if we had dirty rate limit control per VM it would have been much more predictable for user to use. I mean we can keep account of total dirty rate of VM and individual dirty rate and then assign throttle_sleep according to their weights to keep total dirty rate within limit of per vm dirty rate limit. But definately it can be targetted in future and should not be a blocker for now.

>>
>> 1. Why we choose for dirty rate limit per vcpu. I mean it becomes very hard for user to decide per
>>
>>      vcpu dirty rate limit. For e.g. we have 1Gbps network and 10 vcpu vm. Now if someone wants to
>>
>>      keep criteria for convergence as total dirty rate of VM should be lesser than half of available
>>
>>      bandwidth. For this case to ensure convergence user has to give dirty rate limit per vcpu
>>
>>      as 1Gbps/ 2 / 10 = 50Mbps. But assume then that VM has only 1 thread which is actively
>>
>>      dirtying memory, in that case so much of available quota will be wasted.
> This is a good and frequent question about dirtylimit, as mentioned above, throttle occurs only when dirty ring full and exit to user space.
> A vcpu is set up with dirtylimit during live migration, but it does not dirty memory, it may never get throttled.
> The dirtylimit only throttle those vcpu who dirty memory and dirtyrate greater then dirtylimit.
>
>  So would not it be
>>
>>      better to use dirty rate limit control per VM instead of vcpu?
>>
>> 2. Also Here we are adaptively trying to adjust sleep time based on current obsered dirty rate and
>>
>>      dirty rate limit. Can it be more forceful like. Assume we have dirty rate limit of 10pages
>>
>>      per sec and auto-converge/ dirty rate limit was triggered at time 0. Now at any point of time assume
>>
>>      at time 10 sec if number of pages dirtyed are more than 100pages we sleep for interpolated amount
>>
>>      of time. Basically at every dirty ring exit we can check if current number of pages dirtied are more than
>>
>>      what should be allowed by this time?
> Yes, indeed, but as memtioned above, if dirty ring exit, it give Qemu a hint that vcpu is dirting memory, we should check it.
>
> I post the series of dirtylimit capability for RFC, may be it can help me to explain the usage of vcpu dirty limit, it can be found here:
> https://urldefense.proofpoint.com/v2/url?u=https-3A__lore.kernel.org_qemu-2Ddevel_cover.1652762652.git.huangy81-40chinatelecom.cn_&d=DwIDaQ&c=s883GpUCOChKOHiocYtGcg&r=c4KON2DiMd-szjwjggQcuUvTsPWblztAL0gVzaHnNmc&m=eTtzbPA0FcwY1xwq3KPGhj-Nk5zT41MwAjVGH8a-yeQokG7j3pJxtGsFVCzMDH2X&s=iitKUTNXv8Xkvs-n-K1Aow8MxLEP64RdTXw532_oLIY&e=
> Thanks,
> Yong
thanks I read this.

Also i had few additional things in mind.

1. I see there is no limit on cpu->throttle_us_per_full. I see below line but then ring_full_time_us can be very high value so in some rare cases cpu->throttle_us_per_full can be very high. I know few database applications which can not tolerate continous sleep of more than 2 secs. I agree user should not configure very low dirty rate limit to avoid such situation but then user may not have enough idea of algorithm so better we keep out internal limits?

cpu->throttle_us_per_full = MIN(cpu->throttle_us_per_full,
        ring_full_time_us * DIRTYLIMIT_THROTTLE_PCT_MAX);

2. Also this algorithm only control or limits dirty rate by guest writes. There can be some memory dirtying done by virtio based devices which is accounted only at qemu level so may not be accounted through dirty rings so do we have plan for that in future? Those are not issue for auto-converge as it slows full VM but dirty rate limit only slows guest writes.

>>
>> thanks
>>
>> Manish Mishra
>>
>> On 02/03/22 11:25 pm, huangy81@chinatelecom.cn wrote:
>>> From: Hyman Huang(黄勇)<huangy81@chinatelecom.cn>
>>>
>>> Setup a negative feedback system when vCPU thread
>>> handling KVM_EXIT_DIRTY_RING_FULL exit by introducing
>>> throttle_us_per_full field in struct CPUState. Sleep
>>> throttle_us_per_full microseconds to throttle vCPU
>>> if dirtylimit is in service.
>>>
>>> Signed-off-by: Hyman Huang(黄勇)<huangy81@chinatelecom.cn>
>>> Reviewed-by: Peter Xu<peterx@redhat.com>
>>> ---
>>>   accel/kvm/kvm-all.c         |  19 ++-
>>>   include/hw/core/cpu.h       |   6 +
>>>   include/sysemu/dirtylimit.h |  15 +++
>>>   softmmu/dirtylimit.c        | 291 ++++++++++++++++++++++++++++++++++++++++++++
>>>   softmmu/trace-events        |   7 ++
>>>   5 files changed, 337 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
>>> index 8821d80..98e43e6 100644
>>> --- a/accel/kvm/kvm-all.c
>>> +++ b/accel/kvm/kvm-all.c
>>> @@ -45,6 +45,7 @@
>>>   #include "qemu/guest-random.h"
>>>   #include "sysemu/hw_accel.h"
>>>   #include "kvm-cpus.h"
>>> +#include "sysemu/dirtylimit.h"
>>>     #include "hw/boards.h"
>>>   @@ -476,6 +477,7 @@ int kvm_init_vcpu(CPUState *cpu, Error **errp)
>>>       cpu->kvm_state = s;
>>>       cpu->vcpu_dirty = true;
>>>       cpu->dirty_pages = 0;
>>> +    cpu->throttle_us_per_full = 0;
>>>         mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
>>>       if (mmap_size < 0) {
>>> @@ -1469,6 +1471,11 @@ static void *kvm_dirty_ring_reaper_thread(void *data)
>>>            */
>>>           sleep(1);
>>>   +        /* keep sleeping so that dirtylimit not be interfered by reaper */
>>> +        if (dirtylimit_in_service()) {
>>> +            continue;
>>> +        }
>>> +
>>>           trace_kvm_dirty_ring_reaper("wakeup");
>>>           r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING;
>>>   @@ -2964,8 +2971,18 @@ int kvm_cpu_exec(CPUState *cpu)
>>>                */
>>>               trace_kvm_dirty_ring_full(cpu->cpu_index);
>>>               qemu_mutex_lock_iothread();
>>> -            kvm_dirty_ring_reap(kvm_state, NULL);
>>> +            /* We throttle vCPU by making it sleep once it exit from kernel
>>> +             * due to dirty ring full. In the dirtylimit scenario, reaping
>>> +             * all vCPUs after a single vCPU dirty ring get full result in
>>> +             * the miss of sleep, so just reap the ring-fulled vCPU.
>>> +             */
>>> +            if (dirtylimit_in_service()) {
>>> +                kvm_dirty_ring_reap(kvm_state, cpu);
>>> +            } else {
>>> +                kvm_dirty_ring_reap(kvm_state, NULL);
>>> +            }
>>>               qemu_mutex_unlock_iothread();
>>> +            dirtylimit_vcpu_execute(cpu);
>>>               ret = 0;
>>>               break;
>>>           case KVM_EXIT_SYSTEM_EVENT:
>>> diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
>>> index 76ab3b8..dbeb31a 100644
>>> --- a/include/hw/core/cpu.h
>>> +++ b/include/hw/core/cpu.h
>>> @@ -411,6 +411,12 @@ struct CPUState {
>>>        */
>>>       bool throttle_thread_scheduled;
>>>   +    /*
>>> +     * Sleep throttle_us_per_full microseconds once dirty ring is full
>>> +     * if dirty page rate limit is enabled.
>>> +     */
>>> +    int64_t throttle_us_per_full;
>>> +
>>>       bool ignore_memory_transaction_failures;
>>>         /* Used for user-only emulation of prctl(PR_SET_UNALIGN). */
>>> diff --git a/include/sysemu/dirtylimit.h b/include/sysemu/dirtylimit.h
>>> index da459f0..8d2c1f3 100644
>>> --- a/include/sysemu/dirtylimit.h
>>> +++ b/include/sysemu/dirtylimit.h
>>> @@ -19,4 +19,19 @@ void vcpu_dirty_rate_stat_start(void);
>>>   void vcpu_dirty_rate_stat_stop(void);
>>>   void vcpu_dirty_rate_stat_initialize(void);
>>>   void vcpu_dirty_rate_stat_finalize(void);
>>> +
>>> +void dirtylimit_state_lock(void);
>>> +void dirtylimit_state_unlock(void);
>>> +void dirtylimit_state_initialize(void);
>>> +void dirtylimit_state_finalize(void);
>>> +bool dirtylimit_in_service(void);
>>> +bool dirtylimit_vcpu_index_valid(int cpu_index);
>>> +void dirtylimit_process(void);
>>> +void dirtylimit_change(bool start);
>>> +void dirtylimit_set_vcpu(int cpu_index,
>>> +                         uint64_t quota,
>>> +                         bool enable);
>>> +void dirtylimit_set_all(uint64_t quota,
>>> +                        bool enable);
>>> +void dirtylimit_vcpu_execute(CPUState *cpu);
>>>   #endif
>>> diff --git a/softmmu/dirtylimit.c b/softmmu/dirtylimit.c
>>> index 6102e8c..76d0b44 100644
>>> --- a/softmmu/dirtylimit.c
>>> +++ b/softmmu/dirtylimit.c
>>> @@ -18,6 +18,26 @@
>>>   #include "sysemu/dirtylimit.h"
>>>   #include "exec/memory.h"
>>>   #include "hw/boards.h"
>>> +#include "sysemu/kvm.h"
>>> +#include "trace.h"
>>> +
>>> +/*
>>> + * Dirtylimit stop working if dirty page rate error
>>> + * value less than DIRTYLIMIT_TOLERANCE_RANGE
>>> + */
>>> +#define DIRTYLIMIT_TOLERANCE_RANGE  25  /* MB/s */
>>> +/*
>>> + * Plus or minus vcpu sleep time linearly if dirty
>>> + * page rate error value percentage over
>>> + * DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT.
>>> + * Otherwise, plus or minus a fixed vcpu sleep time.
>>> + */
>>> +#define DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT     50
>>> +/*
>>> + * Max vcpu sleep time percentage during a cycle
>>> + * composed of dirty ring full and sleep time.
>>> + */
>>> +#define DIRTYLIMIT_THROTTLE_PCT_MAX 99
>>>     struct {
>>>       VcpuStat stat;
>>> @@ -25,6 +45,30 @@ struct {
>>>       QemuThread thread;
>>>   } *vcpu_dirty_rate_stat;
>>>   +typedef struct VcpuDirtyLimitState {
>>> +    int cpu_index;
>>> +    bool enabled;
>>> +    /*
>>> +     * Quota dirty page rate, unit is MB/s
>>> +     * zero if not enabled.
>>> +     */
>>> +    uint64_t quota;
>>> +} VcpuDirtyLimitState;
>>> +
>>> +struct {
>>> +    VcpuDirtyLimitState *states;
>>> +    /* Max cpus number configured by user */
>>> +    int max_cpus;
>>> +    /* Number of vcpu under dirtylimit */
>>> +    int limited_nvcpu;
>>> +} *dirtylimit_state;
>>> +
>>> +/* protect dirtylimit_state */
>>> +static QemuMutex dirtylimit_mutex;
>>> +
>>> +/* dirtylimit thread quit if dirtylimit_quit is true */
>>> +static bool dirtylimit_quit;
>>> +
>>>   static void vcpu_dirty_rate_stat_collect(void)
>>>   {
>>>       VcpuStat stat;
>>> @@ -54,6 +98,9 @@ static void *vcpu_dirty_rate_stat_thread(void *opaque)
>>>         while (qatomic_read(&vcpu_dirty_rate_stat->running)) {
>>>           vcpu_dirty_rate_stat_collect();
>>> +        if (dirtylimit_in_service()) {
>>> +            dirtylimit_process();
>>> +        }
>>>       }
>>>         /* stop log sync */
>>> @@ -86,9 +133,11 @@ void vcpu_dirty_rate_stat_start(void)
>>>   void vcpu_dirty_rate_stat_stop(void)
>>>   {
>>>       qatomic_set(&vcpu_dirty_rate_stat->running, 0);
>>> +    dirtylimit_state_unlock();
>>>       qemu_mutex_unlock_iothread();
>>>       qemu_thread_join(&vcpu_dirty_rate_stat->thread);
>>>       qemu_mutex_lock_iothread();
>>> +    dirtylimit_state_lock();
>>>   }
>>>     void vcpu_dirty_rate_stat_initialize(void)
>>> @@ -114,3 +163,245 @@ void vcpu_dirty_rate_stat_finalize(void)
>>>       free(vcpu_dirty_rate_stat);
>>>       vcpu_dirty_rate_stat = NULL;
>>>   }
>>> +
>>> +void dirtylimit_state_lock(void)
>>> +{
>>> +    qemu_mutex_lock(&dirtylimit_mutex);
>>> +}
>>> +
>>> +void dirtylimit_state_unlock(void)
>>> +{
>>> +    qemu_mutex_unlock(&dirtylimit_mutex);
>>> +}
>>> +
>>> +static void
>>> +__attribute__((__constructor__)) dirtylimit_mutex_init(void)
>>> +{
>>> +    qemu_mutex_init(&dirtylimit_mutex);
>>> +}
>>> +
>>> +static inline VcpuDirtyLimitState *dirtylimit_vcpu_get_state(int cpu_index)
>>> +{
>>> +    return &dirtylimit_state->states[cpu_index];
>>> +}
>>> +
>>> +void dirtylimit_state_initialize(void)
>>> +{
>>> +    MachineState *ms = MACHINE(qdev_get_machine());
>>> +    int max_cpus = ms->smp.max_cpus;
>>> +    int i;
>>> +
>>> +    dirtylimit_state = g_malloc0(sizeof(*dirtylimit_state));
>>> +
>>> +    dirtylimit_state->states =
>>> +            g_malloc0(sizeof(VcpuDirtyLimitState) * max_cpus);
>>> +
>>> +    for (i = 0; i < max_cpus; i++) {
>>> +        dirtylimit_state->states[i].cpu_index = i;
>>> +    }
>>> +
>>> +    dirtylimit_state->max_cpus = max_cpus;
>>> +    trace_dirtylimit_state_initialize(max_cpus);
>>> +}
>>> +
>>> +void dirtylimit_state_finalize(void)
>>> +{
>>> +    free(dirtylimit_state->states);
>>> +    dirtylimit_state->states = NULL;
>>> +
>>> +    free(dirtylimit_state);
>>> +    dirtylimit_state = NULL;
>>> +
>>> +    trace_dirtylimit_state_finalize();
>>> +}
>>> +
>>> +bool dirtylimit_in_service(void)
>>> +{
>>> +    return !!dirtylimit_state;
>>> +}
>>> +
>>> +bool dirtylimit_vcpu_index_valid(int cpu_index)
>>> +{
>>> +    MachineState *ms = MACHINE(qdev_get_machine());
>>> +
>>> +    return !(cpu_index < 0 ||
>>> +             cpu_index >= ms->smp.max_cpus);
>>> +}
>>> +
>>> +static inline int64_t dirtylimit_dirty_ring_full_time(uint64_t dirtyrate)
>>> +{
>>> +    static uint64_t max_dirtyrate;
>>> +    uint32_t dirty_ring_size = kvm_dirty_ring_size();
>>> +    uint64_t dirty_ring_size_meory_MB =
>>> +        dirty_ring_size * TARGET_PAGE_SIZE >> 20;
>>> +
>>> +    if (max_dirtyrate < dirtyrate) {
>>> +        max_dirtyrate = dirtyrate;
>>> +    }
>>> +
>>> +    return dirty_ring_size_meory_MB * 1000000 / max_dirtyrate;
>>> +}
>>> +
>>> +static inline bool dirtylimit_done(uint64_t quota,
>>> +                                   uint64_t current)
>>> +{
>>> +    uint64_t min, max;
>>> +
>>> +    min = MIN(quota, current);
>>> +    max = MAX(quota, current);
>>> +
>>> +    return ((max - min) <= DIRTYLIMIT_TOLERANCE_RANGE) ? true : false;
>>> +}
>>> +
>>> +static inline bool
>>> +dirtylimit_need_linear_adjustment(uint64_t quota,
>>> +                                  uint64_t current)
>>> +{
>>> +    uint64_t min, max;
>>> +
>>> +    min = MIN(quota, current);
>>> +    max = MAX(quota, current);
>>> +
>>> +    return ((max - min) * 100 / max) > DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT;
>>> +}
>>> +
>>> +static void dirtylimit_set_throttle(CPUState *cpu,
>>> +                                    uint64_t quota,
>>> +                                    uint64_t current)
>>> +{
>>> +    int64_t ring_full_time_us = 0;
>>> +    uint64_t sleep_pct = 0;
>>> +    uint64_t throttle_us = 0;
>>> +
>>> +    if (current == 0) {
>>> +        cpu->throttle_us_per_full = 0;
>>> +        return;
>>> +    }
>>> +
>>> +    ring_full_time_us = dirtylimit_dirty_ring_full_time(current);
>>> +
>>> +    if (dirtylimit_need_linear_adjustment(quota, current)) {
>>> +        if (quota < current) {
>>> +            sleep_pct = (current - quota) * 100 / current;
>>> +            throttle_us =
>>> +                ring_full_time_us * sleep_pct / (double)(100 - sleep_pct);
>>> +            cpu->throttle_us_per_full += throttle_us;
>>> +        } else {
>>> +            sleep_pct = (quota - current) * 100 / quota;
>>> +            throttle_us =
>>> +                ring_full_time_us * sleep_pct / (double)(100 - sleep_pct);
>>> +            cpu->throttle_us_per_full -= throttle_us;
>>> +        }
>>> +
>>> +        trace_dirtylimit_throttle_pct(cpu->cpu_index,
>>> +                                      sleep_pct,
>>> +                                      throttle_us);
>>> +    } else {
>>> +        if (quota < current) {
>>> +            cpu->throttle_us_per_full += ring_full_time_us / 10;
>>> +        } else {
>>> +            cpu->throttle_us_per_full -= ring_full_time_us / 10;
>>> +        }
>>> +    }
>>> +
>>> +    /*
>>> +     * TODO: in the big kvm_dirty_ring_size case (eg: 65536, or other scenario),
>>> +     *       current dirty page rate may never reach the quota, we should stop
>>> +     *       increasing sleep time?
>>> +     */
>>> +    cpu->throttle_us_per_full = MIN(cpu->throttle_us_per_full,
>>> +        ring_full_time_us * DIRTYLIMIT_THROTTLE_PCT_MAX);
>>> +
>>> +    cpu->throttle_us_per_full = MAX(cpu->throttle_us_per_full, 0);
>>> +}
>>> +
>>> +static void dirtylimit_adjust_throttle(CPUState *cpu)
>>> +{
>>> +    uint64_t quota = 0;
>>> +    uint64_t current = 0;
>>> +    int cpu_index = cpu->cpu_index;
>>> +
>>> +    quota = dirtylimit_vcpu_get_state(cpu_index)->quota;
>>> +    current = vcpu_dirty_rate_get(cpu_index);
>>> +
>>> +    if (!dirtylimit_done(quota, current)) {
>>> +        dirtylimit_set_throttle(cpu, quota, current);
>>> +    }
>>> +
>>> +    return;
>>> +}
>>> +
>>> +void dirtylimit_process(void)
>>> +{
>>> +    CPUState *cpu;
>>> +
>>> +    if (!qatomic_read(&dirtylimit_quit)) {
>>> +        dirtylimit_state_lock();
>>> +
>>> +        if (!dirtylimit_in_service()) {
>>> +            dirtylimit_state_unlock();
>>> +            return;
>>> +        }
>>> +
>>> +        CPU_FOREACH(cpu) {
>>> +            if (!dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled) {
>>> +                continue;
>>> +            }
>>> +            dirtylimit_adjust_throttle(cpu);
>>> +        }
>>> +        dirtylimit_state_unlock();
>>> +    }
>>> +}
>>> +
>>> +void dirtylimit_change(bool start)
>>> +{
>>> +    if (start) {
>>> +        qatomic_set(&dirtylimit_quit, 0);
>>> +    } else {
>>> +        qatomic_set(&dirtylimit_quit, 1);
>>> +    }
>>> +}
>>> +
>>> +void dirtylimit_set_vcpu(int cpu_index,
>>> +                         uint64_t quota,
>>> +                         bool enable)
>>> +{
>>> +    trace_dirtylimit_set_vcpu(cpu_index, quota);
>>> +
>>> +    if (enable) {
>>> +        dirtylimit_state->states[cpu_index].quota = quota;
>>> +        if (!dirtylimit_vcpu_get_state(cpu_index)->enabled) {
>>> +            dirtylimit_state->limited_nvcpu++;
>>> +        }
>>> +    } else {
>>> +        dirtylimit_state->states[cpu_index].quota = 0;
>>> +        if (dirtylimit_state->states[cpu_index].enabled) {
>>> +            dirtylimit_state->limited_nvcpu--;
>>> +        }
>>> +    }
>>> +
>>> +    dirtylimit_state->states[cpu_index].enabled = enable;
>>> +}
>>> +
>>> +void dirtylimit_set_all(uint64_t quota,
>>> +                        bool enable)
>>> +{
>>> +    MachineState *ms = MACHINE(qdev_get_machine());
>>> +    int max_cpus = ms->smp.max_cpus;
>>> +    int i;
>>> +
>>> +    for (i = 0; i < max_cpus; i++) {
>>> +        dirtylimit_set_vcpu(i, quota, enable);
>>> +    }
>>> +}
>>> +
>>> +void dirtylimit_vcpu_execute(CPUState *cpu)
>>> +{
>>> +    if (dirtylimit_in_service() &&
>>> + dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled &&
>>> +        cpu->throttle_us_per_full) {
>>> +        trace_dirtylimit_vcpu_execute(cpu->cpu_index,
>>> +                cpu->throttle_us_per_full);
>>> +        usleep(cpu->throttle_us_per_full);
>>> +    }
>>> +}
>>> diff --git a/softmmu/trace-events b/softmmu/trace-events
>>> index 9c88887..22606dc 100644
>>> --- a/softmmu/trace-events
>>> +++ b/softmmu/trace-events
>>> @@ -31,3 +31,10 @@ runstate_set(int current_state, const char *current_state_str, int new_state, co
>>>   system_wakeup_request(int reason) "reason=%d"
>>>   qemu_system_shutdown_request(int reason) "reason=%d"
>>>   qemu_system_powerdown_request(void) ""
>>> +
>>> +#dirtylimit.c
>>> +dirtylimit_state_initialize(int max_cpus) "dirtylimit state initialize: max cpus %d"
>>> +dirtylimit_state_finalize(void)
>>> +dirtylimit_throttle_pct(int cpu_index, uint64_t pct, int64_t time_us) "CPU[%d] throttle percent: %" PRIu64 ", throttle adjust time %"PRIi64 " us"
>>> +dirtylimit_set_vcpu(int cpu_index, uint64_t quota) "CPU[%d] set dirty page rate limit %"PRIu64
>>> +dirtylimit_vcpu_execute(int cpu_index, int64_t sleep_time_us) "CPU[%d] sleep %"PRIi64 " us"
>

Re: [PATCH v17 6/8] softmmu/dirtylimit: Implement virtual CPU throttle

Posted by Hyman Huang 3 years, 8 months ago


在 2022/5/24 0:44, manish.mishra 写道:
> 
> On 17/05/22 1:49 pm, Hyman Huang wrote:
>> Thanks Manish for the comment, i'll give my explanation and any 
>> supplement are welcomed.
> Really sorry for such late reply Hyman, this slipped my mind.
>>
>> 在 2022/5/17 1:13, manish.mishra 写道:
>>> Hi Hyman Huang,
>>>
>>> I had few doubts regarding this patch series.
>> For the first point, m'm rudely guessing that you want to figure out 
>> how should we set the vcpu dirty limit correctly during live migration 
>> to make it convergent.
>>
>> This can be achieved by set a single dirtylimit value on all vcpus, 
>> the value need not be equivalent of the half of available bandwidth so 
>> precisely since the dirtylimit is sufficient conditions of migration 
>> sucess, but not necessary condition.
>>
>> We can set the dirtylimit as the minimum of what user can tolerate, in 
>> most case, migration can achieve convergent in advance and do the 
>> switchover with the real dirtyrate greater than dirtylimit. This can 
>> be implemented because Qemu will check the criteria every iteration, 
>> once it meet the condition, Qemu will do the switch over no matter 
>> what convergent algo is.
> 
> 
> Yes got it Hyman, my question was in direction that if we control dirty 
> rate per vcpu, total dirty of VM become very unpredictable. For example 
> if we have set dirty rate limit of each vcpu 50MBps for 10vcpu VM. Then 
> total dirty rate of VM can be anywhere from 0-500MBps based on how many 
> vcpu are active and how much. So if we had dirty rate limit control per 
> VM it would have been much more predictable for user to use. I mean we 
> can keep account of total dirty rate of VM and individual dirty rate and 
> then assign throttle_sleep according to their weights to keep total 
> dirty rate within limit of per vm dirty rate limit. But definately it 
> can be targetted in future and should not be a blocker for now.
I got it. This patchset doesn't aim to how to control the dirty page 
rate precisely from my view, but to provide a method to limit dirty page 
rate. So the two views don't conflict. Dirtylimit focuse on limitting 
"write-vcpu" and introducing quota dirty page rate just to make the 
throttle algo has homogeneous metric parameters. As to how to control
the dirty page rate precisely, there may be a fresh new algo to 
implement it(as you see, monitor the vm dirty page rate, stat and 
calculate weights, assign throttle_sleep and so on)
> 
>>>
>>> 1. Why we choose for dirty rate limit per vcpu. I mean it becomes 
>>> very hard for user to decide per
>>>
>>>      vcpu dirty rate limit. For e.g. we have 1Gbps network and 10 
>>> vcpu vm. Now if someone wants to
>>>
>>>      keep criteria for convergence as total dirty rate of VM should 
>>> be lesser than half of available
>>>
>>>      bandwidth. For this case to ensure convergence user has to give 
>>> dirty rate limit per vcpu
>>>
>>>      as 1Gbps/ 2 / 10 = 50Mbps. But assume then that VM has only 1 
>>> thread which is actively
>>>
>>>      dirtying memory, in that case so much of available quota will be 
>>> wasted.
>> This is a good and frequent question about dirtylimit, as mentioned 
>> above, throttle occurs only when dirty ring full and exit to user space.
>> A vcpu is set up with dirtylimit during live migration, but it does 
>> not dirty memory, it may never get throttled.
>> The dirtylimit only throttle those vcpu who dirty memory and dirtyrate 
>> greater then dirtylimit.
>>
>>  So would not it be
>>>
>>>      better to use dirty rate limit control per VM instead of vcpu?
>>>
>>> 2. Also Here we are adaptively trying to adjust sleep time based on 
>>> current obsered dirty rate and
>>>
>>>      dirty rate limit. Can it be more forceful like. Assume we have 
>>> dirty rate limit of 10pages
>>>
>>>      per sec and auto-converge/ dirty rate limit was triggered at 
>>> time 0. Now at any point of time assume
>>>
>>>      at time 10 sec if number of pages dirtyed are more than 100pages 
>>> we sleep for interpolated amount
>>>
>>>      of time. Basically at every dirty ring exit we can check if 
>>> current number of pages dirtied are more than
>>>
>>>      what should be allowed by this time?
>> Yes, indeed, but as memtioned above, if dirty ring exit, it give Qemu 
>> a hint that vcpu is dirting memory, we should check it.
>>
>> I post the series of dirtylimit capability for RFC, may be it can help 
>> me to explain the usage of vcpu dirty limit, it can be found here:
>> https://urldefense.proofpoint.com/v2/url?u=https-3A__lore.kernel.org_qemu-2Ddevel_cover.1652762652.git.huangy81-40chinatelecom.cn_&d=DwIDaQ&c=s883GpUCOChKOHiocYtGcg&r=c4KON2DiMd-szjwjggQcuUvTsPWblztAL0gVzaHnNmc&m=eTtzbPA0FcwY1xwq3KPGhj-Nk5zT41MwAjVGH8a-yeQokG7j3pJxtGsFVCzMDH2X&s=iitKUTNXv8Xkvs-n-K1Aow8MxLEP64RdTXw532_oLIY&e= 
>>
>> Thanks,
>> Yong
> thanks I read this.
> 
> Also i had few additional things in mind.
> 
> 1. I see there is no limit on cpu->throttle_us_per_full. I see below 
> line but then ring_full_time_us can be very high value so in some rare 
> cases cpu->throttle_us_per_full can be very high. I know few database 
> applications which can not tolerate continous sleep of more than 2 secs. 
> I agree user should not configure very low dirty rate limit to avoid 
> such situation but then user may not have enough idea of algorithm so 
> better we keep out internal limits?
> 
> cpu->throttle_us_per_full = MIN(cpu->throttle_us_per_full,
>         ring_full_time_us * DIRTYLIMIT_THROTTLE_PCT_MAX);
The ring_full_time_us is affected by two factors, dirty ring size and 
max dirty page rate, as the following formula:

ring_full_time_us  = dirty ring size * page size / max_dirtyrate;

Maximum of dirty ring size is 65536, so from this perspective, the 
ring_full_time_us has limit but not hard-coded via macro。The scenario 
that ring_full_time_us is high only occur when dirty ring size 
configured with max value 65536。I configured the max dirty ring size 
and open the dirtylimit_vcpu_execute  trace event, it show that 
throttle_us_per_full <= 1s in my test server environment。Indeed, i 
agree that ring_full_time_us can be very high in some case, but adding a 
perfect limit also need a lot experience. I suggest that we can leave 
the code untouched and keep the algo as simple as possible until we do 
make sure that application is affected.

> 
> 2. Also this algorithm only control or limits dirty rate by guest 
> writes. There can be some memory dirtying done by virtio based devices 
> which is accounted only at qemu level so may not be accounted through 
> dirty rings so do we have plan for that in future? Those are not issue 
> for auto-converge as it slows full VM but dirty rate limit only slows 
> guest writes.
> 
 From the migration point of view, time spent on migrating memory is far 
greater than migrating devices emulated by qemu. I think we can do that 
when migrating device costs the same magnitude time as migrating memory.

As to auto-converge, it throttle vcpu by kicking it and force it to 
sleep periodically. The two seems has no much difference from the 
perspective of internal method but the auto-converge is kind of 
"offensive" when doing restraint. I'll read the auto-converge 
implementation code and figure out the problem you point out.

Thanks

>>>
>>> thanks
>>>
>>> Manish Mishra
>>>
>>> On 02/03/22 11:25 pm, huangy81@chinatelecom.cn wrote:
>>>> From: Hyman Huang(黄勇)<huangy81@chinatelecom.cn>
>>>>
>>>> Setup a negative feedback system when vCPU thread
>>>> handling KVM_EXIT_DIRTY_RING_FULL exit by introducing
>>>> throttle_us_per_full field in struct CPUState. Sleep
>>>> throttle_us_per_full microseconds to throttle vCPU
>>>> if dirtylimit is in service.
>>>>
>>>> Signed-off-by: Hyman Huang(黄勇)<huangy81@chinatelecom.cn>
>>>> Reviewed-by: Peter Xu<peterx@redhat.com>
>>>> ---
>>>>   accel/kvm/kvm-all.c         |  19 ++-
>>>>   include/hw/core/cpu.h       |   6 +
>>>>   include/sysemu/dirtylimit.h |  15 +++
>>>>   softmmu/dirtylimit.c        | 291 
>>>> ++++++++++++++++++++++++++++++++++++++++++++
>>>>   softmmu/trace-events        |   7 ++
>>>>   5 files changed, 337 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
>>>> index 8821d80..98e43e6 100644
>>>> --- a/accel/kvm/kvm-all.c
>>>> +++ b/accel/kvm/kvm-all.c
>>>> @@ -45,6 +45,7 @@
>>>>   #include "qemu/guest-random.h"
>>>>   #include "sysemu/hw_accel.h"
>>>>   #include "kvm-cpus.h"
>>>> +#include "sysemu/dirtylimit.h"
>>>>     #include "hw/boards.h"
>>>>   @@ -476,6 +477,7 @@ int kvm_init_vcpu(CPUState *cpu, Error **errp)
>>>>       cpu->kvm_state = s;
>>>>       cpu->vcpu_dirty = true;
>>>>       cpu->dirty_pages = 0;
>>>> +    cpu->throttle_us_per_full = 0;
>>>>         mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
>>>>       if (mmap_size < 0) {
>>>> @@ -1469,6 +1471,11 @@ static void 
>>>> *kvm_dirty_ring_reaper_thread(void *data)
>>>>            */
>>>>           sleep(1);
>>>>   +        /* keep sleeping so that dirtylimit not be interfered by 
>>>> reaper */
>>>> +        if (dirtylimit_in_service()) {
>>>> +            continue;
>>>> +        }
>>>> +
>>>>           trace_kvm_dirty_ring_reaper("wakeup");
>>>>           r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING;
>>>>   @@ -2964,8 +2971,18 @@ int kvm_cpu_exec(CPUState *cpu)
>>>>                */
>>>>               trace_kvm_dirty_ring_full(cpu->cpu_index);
>>>>               qemu_mutex_lock_iothread();
>>>> -            kvm_dirty_ring_reap(kvm_state, NULL);
>>>> +            /* We throttle vCPU by making it sleep once it exit 
>>>> from kernel
>>>> +             * due to dirty ring full. In the dirtylimit scenario, 
>>>> reaping
>>>> +             * all vCPUs after a single vCPU dirty ring get full 
>>>> result in
>>>> +             * the miss of sleep, so just reap the ring-fulled vCPU.
>>>> +             */
>>>> +            if (dirtylimit_in_service()) {
>>>> +                kvm_dirty_ring_reap(kvm_state, cpu);
>>>> +            } else {
>>>> +                kvm_dirty_ring_reap(kvm_state, NULL);
>>>> +            }
>>>>               qemu_mutex_unlock_iothread();
>>>> +            dirtylimit_vcpu_execute(cpu);
>>>>               ret = 0;
>>>>               break;
>>>>           case KVM_EXIT_SYSTEM_EVENT:
>>>> diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
>>>> index 76ab3b8..dbeb31a 100644
>>>> --- a/include/hw/core/cpu.h
>>>> +++ b/include/hw/core/cpu.h
>>>> @@ -411,6 +411,12 @@ struct CPUState {
>>>>        */
>>>>       bool throttle_thread_scheduled;
>>>>   +    /*
>>>> +     * Sleep throttle_us_per_full microseconds once dirty ring is full
>>>> +     * if dirty page rate limit is enabled.
>>>> +     */
>>>> +    int64_t throttle_us_per_full;
>>>> +
>>>>       bool ignore_memory_transaction_failures;
>>>>         /* Used for user-only emulation of prctl(PR_SET_UNALIGN). */
>>>> diff --git a/include/sysemu/dirtylimit.h b/include/sysemu/dirtylimit.h
>>>> index da459f0..8d2c1f3 100644
>>>> --- a/include/sysemu/dirtylimit.h
>>>> +++ b/include/sysemu/dirtylimit.h
>>>> @@ -19,4 +19,19 @@ void vcpu_dirty_rate_stat_start(void);
>>>>   void vcpu_dirty_rate_stat_stop(void);
>>>>   void vcpu_dirty_rate_stat_initialize(void);
>>>>   void vcpu_dirty_rate_stat_finalize(void);
>>>> +
>>>> +void dirtylimit_state_lock(void);
>>>> +void dirtylimit_state_unlock(void);
>>>> +void dirtylimit_state_initialize(void);
>>>> +void dirtylimit_state_finalize(void);
>>>> +bool dirtylimit_in_service(void);
>>>> +bool dirtylimit_vcpu_index_valid(int cpu_index);
>>>> +void dirtylimit_process(void);
>>>> +void dirtylimit_change(bool start);
>>>> +void dirtylimit_set_vcpu(int cpu_index,
>>>> +                         uint64_t quota,
>>>> +                         bool enable);
>>>> +void dirtylimit_set_all(uint64_t quota,
>>>> +                        bool enable);
>>>> +void dirtylimit_vcpu_execute(CPUState *cpu);
>>>>   #endif
>>>> diff --git a/softmmu/dirtylimit.c b/softmmu/dirtylimit.c
>>>> index 6102e8c..76d0b44 100644
>>>> --- a/softmmu/dirtylimit.c
>>>> +++ b/softmmu/dirtylimit.c
>>>> @@ -18,6 +18,26 @@
>>>>   #include "sysemu/dirtylimit.h"
>>>>   #include "exec/memory.h"
>>>>   #include "hw/boards.h"
>>>> +#include "sysemu/kvm.h"
>>>> +#include "trace.h"
>>>> +
>>>> +/*
>>>> + * Dirtylimit stop working if dirty page rate error
>>>> + * value less than DIRTYLIMIT_TOLERANCE_RANGE
>>>> + */
>>>> +#define DIRTYLIMIT_TOLERANCE_RANGE  25  /* MB/s */
>>>> +/*
>>>> + * Plus or minus vcpu sleep time linearly if dirty
>>>> + * page rate error value percentage over
>>>> + * DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT.
>>>> + * Otherwise, plus or minus a fixed vcpu sleep time.
>>>> + */
>>>> +#define DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT     50
>>>> +/*
>>>> + * Max vcpu sleep time percentage during a cycle
>>>> + * composed of dirty ring full and sleep time.
>>>> + */
>>>> +#define DIRTYLIMIT_THROTTLE_PCT_MAX 99
>>>>     struct {
>>>>       VcpuStat stat;
>>>> @@ -25,6 +45,30 @@ struct {
>>>>       QemuThread thread;
>>>>   } *vcpu_dirty_rate_stat;
>>>>   +typedef struct VcpuDirtyLimitState {
>>>> +    int cpu_index;
>>>> +    bool enabled;
>>>> +    /*
>>>> +     * Quota dirty page rate, unit is MB/s
>>>> +     * zero if not enabled.
>>>> +     */
>>>> +    uint64_t quota;
>>>> +} VcpuDirtyLimitState;
>>>> +
>>>> +struct {
>>>> +    VcpuDirtyLimitState *states;
>>>> +    /* Max cpus number configured by user */
>>>> +    int max_cpus;
>>>> +    /* Number of vcpu under dirtylimit */
>>>> +    int limited_nvcpu;
>>>> +} *dirtylimit_state;
>>>> +
>>>> +/* protect dirtylimit_state */
>>>> +static QemuMutex dirtylimit_mutex;
>>>> +
>>>> +/* dirtylimit thread quit if dirtylimit_quit is true */
>>>> +static bool dirtylimit_quit;
>>>> +
>>>>   static void vcpu_dirty_rate_stat_collect(void)
>>>>   {
>>>>       VcpuStat stat;
>>>> @@ -54,6 +98,9 @@ static void *vcpu_dirty_rate_stat_thread(void 
>>>> *opaque)
>>>>         while (qatomic_read(&vcpu_dirty_rate_stat->running)) {
>>>>           vcpu_dirty_rate_stat_collect();
>>>> +        if (dirtylimit_in_service()) {
>>>> +            dirtylimit_process();
>>>> +        }
>>>>       }
>>>>         /* stop log sync */
>>>> @@ -86,9 +133,11 @@ void vcpu_dirty_rate_stat_start(void)
>>>>   void vcpu_dirty_rate_stat_stop(void)
>>>>   {
>>>>       qatomic_set(&vcpu_dirty_rate_stat->running, 0);
>>>> +    dirtylimit_state_unlock();
>>>>       qemu_mutex_unlock_iothread();
>>>>       qemu_thread_join(&vcpu_dirty_rate_stat->thread);
>>>>       qemu_mutex_lock_iothread();
>>>> +    dirtylimit_state_lock();
>>>>   }
>>>>     void vcpu_dirty_rate_stat_initialize(void)
>>>> @@ -114,3 +163,245 @@ void vcpu_dirty_rate_stat_finalize(void)
>>>>       free(vcpu_dirty_rate_stat);
>>>>       vcpu_dirty_rate_stat = NULL;
>>>>   }
>>>> +
>>>> +void dirtylimit_state_lock(void)
>>>> +{
>>>> +    qemu_mutex_lock(&dirtylimit_mutex);
>>>> +}
>>>> +
>>>> +void dirtylimit_state_unlock(void)
>>>> +{
>>>> +    qemu_mutex_unlock(&dirtylimit_mutex);
>>>> +}
>>>> +
>>>> +static void
>>>> +__attribute__((__constructor__)) dirtylimit_mutex_init(void)
>>>> +{
>>>> +    qemu_mutex_init(&dirtylimit_mutex);
>>>> +}
>>>> +
>>>> +static inline VcpuDirtyLimitState *dirtylimit_vcpu_get_state(int 
>>>> cpu_index)
>>>> +{
>>>> +    return &dirtylimit_state->states[cpu_index];
>>>> +}
>>>> +
>>>> +void dirtylimit_state_initialize(void)
>>>> +{
>>>> +    MachineState *ms = MACHINE(qdev_get_machine());
>>>> +    int max_cpus = ms->smp.max_cpus;
>>>> +    int i;
>>>> +
>>>> +    dirtylimit_state = g_malloc0(sizeof(*dirtylimit_state));
>>>> +
>>>> +    dirtylimit_state->states =
>>>> +            g_malloc0(sizeof(VcpuDirtyLimitState) * max_cpus);
>>>> +
>>>> +    for (i = 0; i < max_cpus; i++) {
>>>> +        dirtylimit_state->states[i].cpu_index = i;
>>>> +    }
>>>> +
>>>> +    dirtylimit_state->max_cpus = max_cpus;
>>>> +    trace_dirtylimit_state_initialize(max_cpus);
>>>> +}
>>>> +
>>>> +void dirtylimit_state_finalize(void)
>>>> +{
>>>> +    free(dirtylimit_state->states);
>>>> +    dirtylimit_state->states = NULL;
>>>> +
>>>> +    free(dirtylimit_state);
>>>> +    dirtylimit_state = NULL;
>>>> +
>>>> +    trace_dirtylimit_state_finalize();
>>>> +}
>>>> +
>>>> +bool dirtylimit_in_service(void)
>>>> +{
>>>> +    return !!dirtylimit_state;
>>>> +}
>>>> +
>>>> +bool dirtylimit_vcpu_index_valid(int cpu_index)
>>>> +{
>>>> +    MachineState *ms = MACHINE(qdev_get_machine());
>>>> +
>>>> +    return !(cpu_index < 0 ||
>>>> +             cpu_index >= ms->smp.max_cpus);
>>>> +}
>>>> +
>>>> +static inline int64_t dirtylimit_dirty_ring_full_time(uint64_t 
>>>> dirtyrate)
>>>> +{
>>>> +    static uint64_t max_dirtyrate;
>>>> +    uint32_t dirty_ring_size = kvm_dirty_ring_size();
>>>> +    uint64_t dirty_ring_size_meory_MB =
>>>> +        dirty_ring_size * TARGET_PAGE_SIZE >> 20;
>>>> +
>>>> +    if (max_dirtyrate < dirtyrate) {
>>>> +        max_dirtyrate = dirtyrate;
>>>> +    }
>>>> +
>>>> +    return dirty_ring_size_meory_MB * 1000000 / max_dirtyrate;
>>>> +}
>>>> +
>>>> +static inline bool dirtylimit_done(uint64_t quota,
>>>> +                                   uint64_t current)
>>>> +{
>>>> +    uint64_t min, max;
>>>> +
>>>> +    min = MIN(quota, current);
>>>> +    max = MAX(quota, current);
>>>> +
>>>> +    return ((max - min) <= DIRTYLIMIT_TOLERANCE_RANGE) ? true : false;
>>>> +}
>>>> +
>>>> +static inline bool
>>>> +dirtylimit_need_linear_adjustment(uint64_t quota,
>>>> +                                  uint64_t current)
>>>> +{
>>>> +    uint64_t min, max;
>>>> +
>>>> +    min = MIN(quota, current);
>>>> +    max = MAX(quota, current);
>>>> +
>>>> +    return ((max - min) * 100 / max) > 
>>>> DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT;
>>>> +}
>>>> +
>>>> +static void dirtylimit_set_throttle(CPUState *cpu,
>>>> +                                    uint64_t quota,
>>>> +                                    uint64_t current)
>>>> +{
>>>> +    int64_t ring_full_time_us = 0;
>>>> +    uint64_t sleep_pct = 0;
>>>> +    uint64_t throttle_us = 0;
>>>> +
>>>> +    if (current == 0) {
>>>> +        cpu->throttle_us_per_full = 0;
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    ring_full_time_us = dirtylimit_dirty_ring_full_time(current);
>>>> +
>>>> +    if (dirtylimit_need_linear_adjustment(quota, current)) {
>>>> +        if (quota < current) {
>>>> +            sleep_pct = (current - quota) * 100 / current;
>>>> +            throttle_us =
>>>> +                ring_full_time_us * sleep_pct / (double)(100 - 
>>>> sleep_pct);
>>>> +            cpu->throttle_us_per_full += throttle_us;
>>>> +        } else {
>>>> +            sleep_pct = (quota - current) * 100 / quota;
>>>> +            throttle_us =
>>>> +                ring_full_time_us * sleep_pct / (double)(100 - 
>>>> sleep_pct);
>>>> +            cpu->throttle_us_per_full -= throttle_us;
>>>> +        }
>>>> +
>>>> +        trace_dirtylimit_throttle_pct(cpu->cpu_index,
>>>> +                                      sleep_pct,
>>>> +                                      throttle_us);
>>>> +    } else {
>>>> +        if (quota < current) {
>>>> +            cpu->throttle_us_per_full += ring_full_time_us / 10;
>>>> +        } else {
>>>> +            cpu->throttle_us_per_full -= ring_full_time_us / 10;
>>>> +        }
>>>> +    }
>>>> +
>>>> +    /*
>>>> +     * TODO: in the big kvm_dirty_ring_size case (eg: 65536, or 
>>>> other scenario),
>>>> +     *       current dirty page rate may never reach the quota, we 
>>>> should stop
>>>> +     *       increasing sleep time?
>>>> +     */
>>>> +    cpu->throttle_us_per_full = MIN(cpu->throttle_us_per_full,
>>>> +        ring_full_time_us * DIRTYLIMIT_THROTTLE_PCT_MAX);
>>>> +
>>>> +    cpu->throttle_us_per_full = MAX(cpu->throttle_us_per_full, 0);
>>>> +}
>>>> +
>>>> +static void dirtylimit_adjust_throttle(CPUState *cpu)
>>>> +{
>>>> +    uint64_t quota = 0;
>>>> +    uint64_t current = 0;
>>>> +    int cpu_index = cpu->cpu_index;
>>>> +
>>>> +    quota = dirtylimit_vcpu_get_state(cpu_index)->quota;
>>>> +    current = vcpu_dirty_rate_get(cpu_index);
>>>> +
>>>> +    if (!dirtylimit_done(quota, current)) {
>>>> +        dirtylimit_set_throttle(cpu, quota, current);
>>>> +    }
>>>> +
>>>> +    return;
>>>> +}
>>>> +
>>>> +void dirtylimit_process(void)
>>>> +{
>>>> +    CPUState *cpu;
>>>> +
>>>> +    if (!qatomic_read(&dirtylimit_quit)) {
>>>> +        dirtylimit_state_lock();
>>>> +
>>>> +        if (!dirtylimit_in_service()) {
>>>> +            dirtylimit_state_unlock();
>>>> +            return;
>>>> +        }
>>>> +
>>>> +        CPU_FOREACH(cpu) {
>>>> +            if (!dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled) {
>>>> +                continue;
>>>> +            }
>>>> +            dirtylimit_adjust_throttle(cpu);
>>>> +        }
>>>> +        dirtylimit_state_unlock();
>>>> +    }
>>>> +}
>>>> +
>>>> +void dirtylimit_change(bool start)
>>>> +{
>>>> +    if (start) {
>>>> +        qatomic_set(&dirtylimit_quit, 0);
>>>> +    } else {
>>>> +        qatomic_set(&dirtylimit_quit, 1);
>>>> +    }
>>>> +}
>>>> +
>>>> +void dirtylimit_set_vcpu(int cpu_index,
>>>> +                         uint64_t quota,
>>>> +                         bool enable)
>>>> +{
>>>> +    trace_dirtylimit_set_vcpu(cpu_index, quota);
>>>> +
>>>> +    if (enable) {
>>>> +        dirtylimit_state->states[cpu_index].quota = quota;
>>>> +        if (!dirtylimit_vcpu_get_state(cpu_index)->enabled) {
>>>> +            dirtylimit_state->limited_nvcpu++;
>>>> +        }
>>>> +    } else {
>>>> +        dirtylimit_state->states[cpu_index].quota = 0;
>>>> +        if (dirtylimit_state->states[cpu_index].enabled) {
>>>> +            dirtylimit_state->limited_nvcpu--;
>>>> +        }
>>>> +    }
>>>> +
>>>> +    dirtylimit_state->states[cpu_index].enabled = enable;
>>>> +}
>>>> +
>>>> +void dirtylimit_set_all(uint64_t quota,
>>>> +                        bool enable)
>>>> +{
>>>> +    MachineState *ms = MACHINE(qdev_get_machine());
>>>> +    int max_cpus = ms->smp.max_cpus;
>>>> +    int i;
>>>> +
>>>> +    for (i = 0; i < max_cpus; i++) {
>>>> +        dirtylimit_set_vcpu(i, quota, enable);
>>>> +    }
>>>> +}
>>>> +
>>>> +void dirtylimit_vcpu_execute(CPUState *cpu)
>>>> +{
>>>> +    if (dirtylimit_in_service() &&
>>>> + dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled &&
>>>> +        cpu->throttle_us_per_full) {
>>>> +        trace_dirtylimit_vcpu_execute(cpu->cpu_index,
>>>> +                cpu->throttle_us_per_full);
>>>> +        usleep(cpu->throttle_us_per_full);
>>>> +    }
>>>> +}
>>>> diff --git a/softmmu/trace-events b/softmmu/trace-events
>>>> index 9c88887..22606dc 100644
>>>> --- a/softmmu/trace-events
>>>> +++ b/softmmu/trace-events
>>>> @@ -31,3 +31,10 @@ runstate_set(int current_state, const char 
>>>> *current_state_str, int new_state, co
>>>>   system_wakeup_request(int reason) "reason=%d"
>>>>   qemu_system_shutdown_request(int reason) "reason=%d"
>>>>   qemu_system_powerdown_request(void) ""
>>>> +
>>>> +#dirtylimit.c
>>>> +dirtylimit_state_initialize(int max_cpus) "dirtylimit state 
>>>> initialize: max cpus %d"
>>>> +dirtylimit_state_finalize(void)
>>>> +dirtylimit_throttle_pct(int cpu_index, uint64_t pct, int64_t 
>>>> time_us) "CPU[%d] throttle percent: %" PRIu64 ", throttle adjust 
>>>> time %"PRIi64 " us"
>>>> +dirtylimit_set_vcpu(int cpu_index, uint64_t quota) "CPU[%d] set 
>>>> dirty page rate limit %"PRIu64
>>>> +dirtylimit_vcpu_execute(int cpu_index, int64_t sleep_time_us) 
>>>> "CPU[%d] sleep %"PRIi64 " us"
>>

-- 
Best regard

Hyman Huang(黄勇)

Re: [PATCH v17 6/8] softmmu/dirtylimit: Implement virtual CPU throttle

Posted by Peter Xu 3 years, 8 months ago

On Wed, May 25, 2022 at 11:38:26PM +0800, Hyman Huang wrote:
> > 2. Also this algorithm only control or limits dirty rate by guest
> > writes. There can be some memory dirtying done by virtio based devices
> > which is accounted only at qemu level so may not be accounted through
> > dirty rings so do we have plan for that in future? Those are not issue
> > for auto-converge as it slows full VM but dirty rate limit only slows
> > guest writes.
> > 
> From the migration point of view, time spent on migrating memory is far
> greater than migrating devices emulated by qemu. I think we can do that when
> migrating device costs the same magnitude time as migrating memory.
> 
> As to auto-converge, it throttle vcpu by kicking it and force it to sleep
> periodically. The two seems has no much difference from the perspective of
> internal method but the auto-converge is kind of "offensive" when doing
> restraint. I'll read the auto-converge implementation code and figure out
> the problem you point out.

This seems to be not virtio-specific, but can be applied to any device DMA
writting to guest mem (if not including vfio).  But indeed virtio can be
normally faster.

I'm also curious how fast a device DMA could dirty memories.  This could be
a question to answer to all vcpu-based throttling approaches (including the
quota based approach that was proposed on KVM list).  Maybe for kernel
virtio drivers we can have some easier estimation?  My guess is it'll be
much harder for DPDK-in-guest (aka userspace drivers) because IIUC that
could use a large chunk of guest mem.

[copy Jason too]

-- 
Peter Xu

Re: [PATCH v17 6/8] softmmu/dirtylimit: Implement virtual CPU throttle

Posted by Jason Wang 3 years, 8 months ago

On Wed, May 25, 2022 at 11:56 PM Peter Xu <peterx@redhat.com> wrote:
>
> On Wed, May 25, 2022 at 11:38:26PM +0800, Hyman Huang wrote:
> > > 2. Also this algorithm only control or limits dirty rate by guest
> > > writes. There can be some memory dirtying done by virtio based devices
> > > which is accounted only at qemu level so may not be accounted through
> > > dirty rings so do we have plan for that in future? Those are not issue
> > > for auto-converge as it slows full VM but dirty rate limit only slows
> > > guest writes.
> > >
> > From the migration point of view, time spent on migrating memory is far
> > greater than migrating devices emulated by qemu. I think we can do that when
> > migrating device costs the same magnitude time as migrating memory.
> >
> > As to auto-converge, it throttle vcpu by kicking it and force it to sleep
> > periodically. The two seems has no much difference from the perspective of
> > internal method but the auto-converge is kind of "offensive" when doing
> > restraint. I'll read the auto-converge implementation code and figure out
> > the problem you point out.
>
> This seems to be not virtio-specific, but can be applied to any device DMA
> writting to guest mem (if not including vfio).  But indeed virtio can be
> normally faster.
>
> I'm also curious how fast a device DMA could dirty memories.  This could be
> a question to answer to all vcpu-based throttling approaches (including the
> quota based approach that was proposed on KVM list).  Maybe for kernel
> virtio drivers we can have some easier estimation?

As you said below, it really depends on the speed of the backend.

>  My guess is it'll be
> much harder for DPDK-in-guest (aka userspace drivers) because IIUC that
> could use a large chunk of guest mem.

Probably, for vhost-user backend, it could be ~20Mpps or even higher.

Thanks

>
> [copy Jason too]
>
> --
> Peter Xu
>

Re: [PATCH v17 6/8] softmmu/dirtylimit: Implement virtual CPU throttle

Posted by manish.mishra 3 years, 8 months ago

On 26/05/22 8:21 am, Jason Wang wrote:
> On Wed, May 25, 2022 at 11:56 PM Peter Xu <peterx@redhat.com> wrote:
>> On Wed, May 25, 2022 at 11:38:26PM +0800, Hyman Huang wrote:
>>>> 2. Also this algorithm only control or limits dirty rate by guest
>>>> writes. There can be some memory dirtying done by virtio based devices
>>>> which is accounted only at qemu level so may not be accounted through
>>>> dirty rings so do we have plan for that in future? Those are not issue
>>>> for auto-converge as it slows full VM but dirty rate limit only slows
>>>> guest writes.
>>>>
>>>  From the migration point of view, time spent on migrating memory is far
>>> greater than migrating devices emulated by qemu. I think we can do that when
>>> migrating device costs the same magnitude time as migrating memory.
>>>
>>> As to auto-converge, it throttle vcpu by kicking it and force it to sleep
>>> periodically. The two seems has no much difference from the perspective of
>>> internal method but the auto-converge is kind of "offensive" when doing
>>> restraint. I'll read the auto-converge implementation code and figure out
>>> the problem you point out.
>> This seems to be not virtio-specific, but can be applied to any device DMA
>> writting to guest mem (if not including vfio).  But indeed virtio can be
>> normally faster.
>>
>> I'm also curious how fast a device DMA could dirty memories.  This could be
>> a question to answer to all vcpu-based throttling approaches (including the
>> quota based approach that was proposed on KVM list).  Maybe for kernel
>> virtio drivers we can have some easier estimation?
> As you said below, it really depends on the speed of the backend.
>
>>   My guess is it'll be
>> much harder for DPDK-in-guest (aka userspace drivers) because IIUC that
>> could use a large chunk of guest mem.
> Probably, for vhost-user backend, it could be ~20Mpps or even higher.

Sorry for late response on this. We did experiment with IO on virtio-scsi based disk.

We could see dirty rate of ~500MBps on my system and most of that was not tracked

as kvm_dirty_log. Also for reference i am attaching test we used to avoid tacking

in KVM. (as attached file).

>
> Thanks
>
>> [copy Jason too]
>>
>> --
>> Peter Xu
>>#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <time.h>
#include <unistd.h>

#define PAGE_SIZE 4096
#define GB (1024 * 1024 * 1024)

int main()
{
    char *buff;
    size_t size;
    struct stat stat;
    // Take file of size atleast double of RAM size to
    // achieve max dirty rate possible.
    const char * file_name = "file_10_gb";
    int fd;
    size_t i = 0, count = 0;
    struct timespec ts1, ts0;
    double time_diff;

    fd = open(file_name, O_RDONLY);
    if (fd == -1) {
       perror("Error opening file");
       exit(1);
    }

    fstat (fd, &stat);
    size = stat.st_size;
    printf("File size %ld\n", (long)size);

    buff = (char *)mmap(0, size, PROT_READ, MAP_PRIVATE, fd, 0);
    if (buff == MAP_FAILED) {
       perror("Mmap Error");
       exit(1);
    }

    (void)clock_gettime(CLOCK_MONOTONIC, &ts0);

    while(1) {
       char c;

       i = (i + PAGE_SIZE) % size;
       c = buff[i];
       count++;
       // Check on every 10K pages for rate.
       if (count % 10000 == 0) {
          (void)clock_gettime(CLOCK_MONOTONIC, &ts1);
          time_diff = ((double)ts1.tv_sec  + ts1.tv_nsec * 1.0e-9) -((double)ts0.tv_sec + ts0.tv_nsec * 1.0e-9);
          printf("Expected Dirty rate %f\n", (10000.0 * PAGE_SIZE) / GB / time_diff);
          ts0 = ts1;
       }
    }

    close(fd);
    return 0;
}

Re: [PATCH v17 6/8] softmmu/dirtylimit: Implement virtual CPU throttle

Posted by Peter Xu 3 years, 8 months ago

On Mon, Jun 13, 2022 at 03:28:34PM +0530, manish.mishra wrote:
> 
> On 26/05/22 8:21 am, Jason Wang wrote:
> > On Wed, May 25, 2022 at 11:56 PM Peter Xu <peterx@redhat.com> wrote:
> > > On Wed, May 25, 2022 at 11:38:26PM +0800, Hyman Huang wrote:
> > > > > 2. Also this algorithm only control or limits dirty rate by guest
> > > > > writes. There can be some memory dirtying done by virtio based devices
> > > > > which is accounted only at qemu level so may not be accounted through
> > > > > dirty rings so do we have plan for that in future? Those are not issue
> > > > > for auto-converge as it slows full VM but dirty rate limit only slows
> > > > > guest writes.
> > > > > 
> > > >  From the migration point of view, time spent on migrating memory is far
> > > > greater than migrating devices emulated by qemu. I think we can do that when
> > > > migrating device costs the same magnitude time as migrating memory.
> > > > 
> > > > As to auto-converge, it throttle vcpu by kicking it and force it to sleep
> > > > periodically. The two seems has no much difference from the perspective of
> > > > internal method but the auto-converge is kind of "offensive" when doing
> > > > restraint. I'll read the auto-converge implementation code and figure out
> > > > the problem you point out.
> > > This seems to be not virtio-specific, but can be applied to any device DMA
> > > writting to guest mem (if not including vfio).  But indeed virtio can be
> > > normally faster.
> > > 
> > > I'm also curious how fast a device DMA could dirty memories.  This could be
> > > a question to answer to all vcpu-based throttling approaches (including the
> > > quota based approach that was proposed on KVM list).  Maybe for kernel
> > > virtio drivers we can have some easier estimation?
> > As you said below, it really depends on the speed of the backend.
> > 
> > >   My guess is it'll be
> > > much harder for DPDK-in-guest (aka userspace drivers) because IIUC that
> > > could use a large chunk of guest mem.
> > Probably, for vhost-user backend, it could be ~20Mpps or even higher.
> 
> Sorry for late response on this. We did experiment with IO on virtio-scsi based disk.

Thanks for trying this and sharing it out.

> 
> We could see dirty rate of ~500MBps on my system and most of that was not tracked
> 
> as kvm_dirty_log. Also for reference i am attaching test we used to avoid tacking
> 
> in KVM. (as attached file).

The number looks sane as it seems to be the sequential bandwidth for a
disk, though I'm not 100% sure it'll work as expected since you mmap()ed
the region with private pages rather than shared, so after you did I'm
wondering whether below will happen (also based on the fact that you mapped
twice the size of guest mem as you mentioned in the comment):

  (1) Swap out will start to trigger after you read a lot of data into the
      mem already, then old-read pages will be swapped out to disk (and
      hopefully the swap device does not reside on the same virtio-scsi
      disk or it'll be even more complicated scenario of mixture IOs..),
      meanwhile when you finish reading a round and start to read from
      offset 0 swap-in will start to happen too.  Swapping can slow down
      things already, and I'm wondering whether the 500MB/s was really
      caused by the swapout rather than backend disk reads.  More below.

  (2) Another attribute of private pages AFAICT is after you read it once
      it does not need to be read again from the virtio-scsi disks.  In
      other words, I'm thinking whether starting from the 2nd iteration
      your program won't trigger any DMA at all but purely torturing the
      swap device.

Maybe changing MAP_PRIVATE to MAP_SHARED can emulate better on what we want
to measure, but I'm also not 100% sure on whether it could be accurate..

Thanks,

> 
> > 
> > Thanks
> > 
> > > [copy Jason too]
> > > 
> > > --
> > > Peter Xu
> > > 
> #include <fcntl.h>
> #include <stdio.h>
> #include <stdlib.h>
> #include <sys/mman.h>
> #include <sys/stat.h>
> #include <sys/time.h>
> #include <time.h>
> #include <unistd.h>
> 
> #define PAGE_SIZE 4096
> #define GB (1024 * 1024 * 1024)
> 
> int main()
> {
>     char *buff;
>     size_t size;
>     struct stat stat;
>     // Take file of size atleast double of RAM size to
>     // achieve max dirty rate possible.
>     const char * file_name = "file_10_gb";
>     int fd;
>     size_t i = 0, count = 0;
>     struct timespec ts1, ts0;
>     double time_diff;
> 
>     fd = open(file_name, O_RDONLY);
>     if (fd == -1) {
>        perror("Error opening file");
>        exit(1);
>     }
> 
>     fstat (fd, &stat);
>     size = stat.st_size;
>     printf("File size %ld\n", (long)size);
> 
>     buff = (char *)mmap(0, size, PROT_READ, MAP_PRIVATE, fd, 0);
>     if (buff == MAP_FAILED) {
>        perror("Mmap Error");
>        exit(1);
>     }
> 
>     (void)clock_gettime(CLOCK_MONOTONIC, &ts0);
> 
>     while(1) {
>        char c;
> 
>        i = (i + PAGE_SIZE) % size;
>        c = buff[i];
>        count++;
>        // Check on every 10K pages for rate.
>        if (count % 10000 == 0) {
>           (void)clock_gettime(CLOCK_MONOTONIC, &ts1);
>           time_diff = ((double)ts1.tv_sec  + ts1.tv_nsec * 1.0e-9) -((double)ts0.tv_sec + ts0.tv_nsec * 1.0e-9);
>           printf("Expected Dirty rate %f\n", (10000.0 * PAGE_SIZE) / GB / time_diff);
>           ts0 = ts1;
>        }
>     }
> 
>     close(fd);
>     return 0;
> }


-- 
Peter Xu

Re: [PATCH v17 6/8] softmmu/dirtylimit: Implement virtual CPU throttle

Posted by manish.mishra 3 years, 8 months ago

On 13/06/22 8:03 pm, Peter Xu wrote:
> On Mon, Jun 13, 2022 at 03:28:34PM +0530, manish.mishra wrote:
>> On 26/05/22 8:21 am, Jason Wang wrote:
>>> On Wed, May 25, 2022 at 11:56 PM Peter Xu <peterx@redhat.com> wrote:
>>>> On Wed, May 25, 2022 at 11:38:26PM +0800, Hyman Huang wrote:
>>>>>> 2. Also this algorithm only control or limits dirty rate by guest
>>>>>> writes. There can be some memory dirtying done by virtio based devices
>>>>>> which is accounted only at qemu level so may not be accounted through
>>>>>> dirty rings so do we have plan for that in future? Those are not issue
>>>>>> for auto-converge as it slows full VM but dirty rate limit only slows
>>>>>> guest writes.
>>>>>>
>>>>>   From the migration point of view, time spent on migrating memory is far
>>>>> greater than migrating devices emulated by qemu. I think we can do that when
>>>>> migrating device costs the same magnitude time as migrating memory.
>>>>>
>>>>> As to auto-converge, it throttle vcpu by kicking it and force it to sleep
>>>>> periodically. The two seems has no much difference from the perspective of
>>>>> internal method but the auto-converge is kind of "offensive" when doing
>>>>> restraint. I'll read the auto-converge implementation code and figure out
>>>>> the problem you point out.
>>>> This seems to be not virtio-specific, but can be applied to any device DMA
>>>> writting to guest mem (if not including vfio).  But indeed virtio can be
>>>> normally faster.
>>>>
>>>> I'm also curious how fast a device DMA could dirty memories.  This could be
>>>> a question to answer to all vcpu-based throttling approaches (including the
>>>> quota based approach that was proposed on KVM list).  Maybe for kernel
>>>> virtio drivers we can have some easier estimation?
>>> As you said below, it really depends on the speed of the backend.
>>>
>>>>    My guess is it'll be
>>>> much harder for DPDK-in-guest (aka userspace drivers) because IIUC that
>>>> could use a large chunk of guest mem.
>>> Probably, for vhost-user backend, it could be ~20Mpps or even higher.
>> Sorry for late response on this. We did experiment with IO on virtio-scsi based disk.
> Thanks for trying this and sharing it out.
>
>> We could see dirty rate of ~500MBps on my system and most of that was not tracked
>>
>> as kvm_dirty_log. Also for reference i am attaching test we used to avoid tacking
>>
>> in KVM. (as attached file).
> The number looks sane as it seems to be the sequential bandwidth for a
> disk, though I'm not 100% sure it'll work as expected since you mmap()ed
> the region with private pages rather than shared, so after you did I'm
> wondering whether below will happen (also based on the fact that you mapped
> twice the size of guest mem as you mentioned in the comment):
>
>    (1) Swap out will start to trigger after you read a lot of data into the
>        mem already, then old-read pages will be swapped out to disk (and
>        hopefully the swap device does not reside on the same virtio-scsi
>        disk or it'll be even more complicated scenario of mixture IOs..),
>        meanwhile when you finish reading a round and start to read from
>        offset 0 swap-in will start to happen too.  Swapping can slow down
>        things already, and I'm wondering whether the 500MB/s was really
>        caused by the swapout rather than backend disk reads.  More below.
>
>    (2) Another attribute of private pages AFAICT is after you read it once
>        it does not need to be read again from the virtio-scsi disks.  In
>        other words, I'm thinking whether starting from the 2nd iteration
>        your program won't trigger any DMA at all but purely torturing the
>        swap device.
>
> Maybe changing MAP_PRIVATE to MAP_SHARED can emulate better on what we want
> to measure, but I'm also not 100% sure on whether it could be accurate..
>
> Thanks,
>
Thanks Peter, Yes agree MAP_SHARED should be used here, sorry i missed that 😁.

Yes, my purpose of taking file size larger than RAM_SIZE was to cause

frequent page cache flush and re-populating page-cache pages, not to

trigger swaps. I checked on my VM i had swapping disabled, may be

MAP_PRIVATE did not make difference because it was read-only.

I tested again with MAP_SHARED it comes around ~500MBps.

Thanks

Manish Mishra

>>> Thanks
>>>
>>>> [copy Jason too]
>>>>
>>>> --
>>>> Peter Xu
>>>>
>> #include <fcntl.h>
>> #include <stdio.h>
>> #include <stdlib.h>
>> #include <sys/mman.h>
>> #include <sys/stat.h>
>> #include <sys/time.h>
>> #include <time.h>
>> #include <unistd.h>
>>
>> #define PAGE_SIZE 4096
>> #define GB (1024 * 1024 * 1024)
>>
>> int main()
>> {
>>      char *buff;
>>      size_t size;
>>      struct stat stat;
>>      // Take file of size atleast double of RAM size to
>>      // achieve max dirty rate possible.
>>      const char * file_name = "file_10_gb";
>>      int fd;
>>      size_t i = 0, count = 0;
>>      struct timespec ts1, ts0;
>>      double time_diff;
>>
>>      fd = open(file_name, O_RDONLY);
>>      if (fd == -1) {
>>         perror("Error opening file");
>>         exit(1);
>>      }
>>
>>      fstat (fd, &stat);
>>      size = stat.st_size;
>>      printf("File size %ld\n", (long)size);
>>
>>      buff = (char *)mmap(0, size, PROT_READ, MAP_PRIVATE, fd, 0);
>>      if (buff == MAP_FAILED) {
>>         perror("Mmap Error");
>>         exit(1);
>>      }
>>
>>      (void)clock_gettime(CLOCK_MONOTONIC, &ts0);
>>
>>      while(1) {
>>         char c;
>>
>>         i = (i + PAGE_SIZE) % size;
>>         c = buff[i];
>>         count++;
>>         // Check on every 10K pages for rate.
>>         if (count % 10000 == 0) {
>>            (void)clock_gettime(CLOCK_MONOTONIC, &ts1);
>>            time_diff = ((double)ts1.tv_sec  + ts1.tv_nsec * 1.0e-9) -((double)ts0.tv_sec + ts0.tv_nsec * 1.0e-9);
>>            printf("Expected Dirty rate %f\n", (10000.0 * PAGE_SIZE) / GB / time_diff);
>>            ts0 = ts1;
>>         }
>>      }
>>
>>      close(fd);
>>      return 0;
>> }
>

Re: [PATCH v17 6/8] softmmu/dirtylimit: Implement virtual CPU throttle

Posted by Peter Xu 3 years, 8 months ago

On Mon, Jun 13, 2022 at 09:03:24PM +0530, manish.mishra wrote:
> 
> On 13/06/22 8:03 pm, Peter Xu wrote:
> > On Mon, Jun 13, 2022 at 03:28:34PM +0530, manish.mishra wrote:
> > > On 26/05/22 8:21 am, Jason Wang wrote:
> > > > On Wed, May 25, 2022 at 11:56 PM Peter Xu <peterx@redhat.com> wrote:
> > > > > On Wed, May 25, 2022 at 11:38:26PM +0800, Hyman Huang wrote:
> > > > > > > 2. Also this algorithm only control or limits dirty rate by guest
> > > > > > > writes. There can be some memory dirtying done by virtio based devices
> > > > > > > which is accounted only at qemu level so may not be accounted through
> > > > > > > dirty rings so do we have plan for that in future? Those are not issue
> > > > > > > for auto-converge as it slows full VM but dirty rate limit only slows
> > > > > > > guest writes.
> > > > > > > 
> > > > > >   From the migration point of view, time spent on migrating memory is far
> > > > > > greater than migrating devices emulated by qemu. I think we can do that when
> > > > > > migrating device costs the same magnitude time as migrating memory.
> > > > > > 
> > > > > > As to auto-converge, it throttle vcpu by kicking it and force it to sleep
> > > > > > periodically. The two seems has no much difference from the perspective of
> > > > > > internal method but the auto-converge is kind of "offensive" when doing
> > > > > > restraint. I'll read the auto-converge implementation code and figure out
> > > > > > the problem you point out.
> > > > > This seems to be not virtio-specific, but can be applied to any device DMA
> > > > > writting to guest mem (if not including vfio).  But indeed virtio can be
> > > > > normally faster.
> > > > > 
> > > > > I'm also curious how fast a device DMA could dirty memories.  This could be
> > > > > a question to answer to all vcpu-based throttling approaches (including the
> > > > > quota based approach that was proposed on KVM list).  Maybe for kernel
> > > > > virtio drivers we can have some easier estimation?
> > > > As you said below, it really depends on the speed of the backend.
> > > > 
> > > > >    My guess is it'll be
> > > > > much harder for DPDK-in-guest (aka userspace drivers) because IIUC that
> > > > > could use a large chunk of guest mem.
> > > > Probably, for vhost-user backend, it could be ~20Mpps or even higher.
> > > Sorry for late response on this. We did experiment with IO on virtio-scsi based disk.
> > Thanks for trying this and sharing it out.
> > 
> > > We could see dirty rate of ~500MBps on my system and most of that was not tracked
> > > 
> > > as kvm_dirty_log. Also for reference i am attaching test we used to avoid tacking
> > > 
> > > in KVM. (as attached file).
> > The number looks sane as it seems to be the sequential bandwidth for a
> > disk, though I'm not 100% sure it'll work as expected since you mmap()ed
> > the region with private pages rather than shared, so after you did I'm
> > wondering whether below will happen (also based on the fact that you mapped
> > twice the size of guest mem as you mentioned in the comment):
> > 
> >    (1) Swap out will start to trigger after you read a lot of data into the
> >        mem already, then old-read pages will be swapped out to disk (and
> >        hopefully the swap device does not reside on the same virtio-scsi
> >        disk or it'll be even more complicated scenario of mixture IOs..),
> >        meanwhile when you finish reading a round and start to read from
> >        offset 0 swap-in will start to happen too.  Swapping can slow down
> >        things already, and I'm wondering whether the 500MB/s was really
> >        caused by the swapout rather than backend disk reads.  More below.
> > 
> >    (2) Another attribute of private pages AFAICT is after you read it once
> >        it does not need to be read again from the virtio-scsi disks.  In
> >        other words, I'm thinking whether starting from the 2nd iteration
> >        your program won't trigger any DMA at all but purely torturing the
> >        swap device.
> > 
> > Maybe changing MAP_PRIVATE to MAP_SHARED can emulate better on what we want
> > to measure, but I'm also not 100% sure on whether it could be accurate..
> > 
> > Thanks,
> > 
> Thanks Peter, Yes agree MAP_SHARED should be used here, sorry i missed that 😁.
> 
> Yes, my purpose of taking file size larger than RAM_SIZE was to cause
> 
> frequent page cache flush and re-populating page-cache pages, not to
> 
> trigger swaps. I checked on my VM i had swapping disabled, may be
> 
> MAP_PRIVATE did not make difference because it was read-only.

Makes sense. And yeah I overlooked the RO part - indeed page cache will be
used for RO pages as long as never written.  So it'll behave like shared.

Otherwise for a swap-all-off you should have have hit OOM anyway and the
process probably will get killed sooner or later. :)

> 
> I tested again with MAP_SHARED it comes around ~500MBps.

Makes sense.  I'd guess that's the limitation of the virtio-scsi backend,
IOW the logical limitation of device dirtying memory could be unlimited
(e.g., when we put the virtio backend onto a ramdisk).

-- 
Peter Xu

Re: [PATCH v17 6/8] softmmu/dirtylimit: Implement virtual CPU throttle

Posted by manish.mishra 3 years, 8 months ago

On 26/05/22 8:21 am, Jason Wang wrote:
> On Wed, May 25, 2022 at 11:56 PM Peter Xu <peterx@redhat.com> wrote:
>> On Wed, May 25, 2022 at 11:38:26PM +0800, Hyman Huang wrote:
>>>> 2. Also this algorithm only control or limits dirty rate by guest
>>>> writes. There can be some memory dirtying done by virtio based devices
>>>> which is accounted only at qemu level so may not be accounted through
>>>> dirty rings so do we have plan for that in future? Those are not issue
>>>> for auto-converge as it slows full VM but dirty rate limit only slows
>>>> guest writes.
>>>>
>>>  From the migration point of view, time spent on migrating memory is far
>>> greater than migrating devices emulated by qemu. I think we can do that when
>>> migrating device costs the same magnitude time as migrating memory.
>>>
>>> As to auto-converge, it throttle vcpu by kicking it and force it to sleep
>>> periodically. The two seems has no much difference from the perspective of
>>> internal method but the auto-converge is kind of "offensive" when doing
>>> restraint. I'll read the auto-converge implementation code and figure out
>>> the problem you point out.
>> This seems to be not virtio-specific, but can be applied to any device DMA
>> writting to guest mem (if not including vfio).  But indeed virtio can be
>> normally faster.
>>
>> I'm also curious how fast a device DMA could dirty memories.  This could be
>> a question to answer to all vcpu-based throttling approaches (including the
>> quota based approach that was proposed on KVM list).  Maybe for kernel
>> virtio drivers we can have some easier estimation?
> As you said below, it really depends on the speed of the backend.
>
>>   My guess is it'll be
>> much harder for DPDK-in-guest (aka userspace drivers) because IIUC that
>> could use a large chunk of guest mem.
> Probably, for vhost-user backend, it could be ~20Mpps or even higher.
>
> Thanks
>
>> [copy Jason too]

I will try to get some numbers of this by next week. Jason Just wanted to get more

context why it should be ~20Mbps like it can be as much as throughput limit of

storage/network in worst case?

Also we were internally discussing to keep this kind of throttling not as an alternate

to auto-converge but somehow to run orthogonal to auto-converge with some modifications.

In cases where most dirty is by guest writes auto-converge anyway will not be active as it

decides throttle based on ratio of dirty/2*transferred which if is forced correctly by e.g.

dirty rate limit will be ~1. This is easiest approach we could think for start but can

definately be improved in future. May be something similar can be done for this dirty limit

approach too?

Surely not for this patch series but in future.

thanks

Manish Mishra


>> --
>> Peter Xu
>>