[PATCH] accel/tcg/tcg-accel-ops-rr: ensure fairness with icount

Jamie Iles posted 1 patch 1 year, 1 month ago
Patches applied successfully (tree, apply log)
git fetch https://github.com/patchew-project/qemu tags/patchew/20230414105111.30708-1-quic._5Fjiles@quicinc.com
Maintainers: Richard Henderson <richard.henderson@linaro.org>, Paolo Bonzini <pbonzini@redhat.com>
There is a newer version of this series
accel/tcg/tcg-accel-ops-icount.c | 16 ++++++++++++++--
accel/tcg/tcg-accel-ops-icount.h |  3 ++-
accel/tcg/tcg-accel-ops-rr.c     | 26 +++++++++++++++++++++++++-
3 files changed, 41 insertions(+), 4 deletions(-)
[PATCH] accel/tcg/tcg-accel-ops-rr: ensure fairness with icount
Posted by Jamie Iles 1 year, 1 month ago
The round-robin scheduler will iterate over the CPU list with an
assigned budget until the next timer expiry and may exit early because
of a TB exit.  This is fine under normal operation but with icount
enabled and SMP it is possible for a CPU to be starved of run time and
the system live-locks.

For example, booting a riscv64 platform with '-icount
shift=0,align=off,sleep=on -smp 2' we observe a livelock once the kernel
has timers enabled and starts performing TLB shootdowns.  In this case
we have CPU 0 in M-mode with interrupts disabled sending an IPI to CPU
1.  As we enter the TCG loop, we assign the icount budget up to the next
timer interrupt to CPU 0 and begin executing, where the guest sits in a
busy loop exhausting all of the budget before we try to execute CPU 1, which
is the target of the IPI but CPU 1 is left with no budget with which to
execute and the process repeats.

We try here to add some fairness by splitting the budget across all of
the CPUs on the thread fairly before entering each one.  The CPU count
is cached on CPU list generation ID to avoid iterating the list on each
loop iteration.  With this change it is possible to boot an SMP rv64
guest with icount enabled and no hangs.

Signed-off-by: Jamie Iles <quic_jiles@quicinc.com>
---
 accel/tcg/tcg-accel-ops-icount.c | 16 ++++++++++++++--
 accel/tcg/tcg-accel-ops-icount.h |  3 ++-
 accel/tcg/tcg-accel-ops-rr.c     | 26 +++++++++++++++++++++++++-
 3 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/accel/tcg/tcg-accel-ops-icount.c b/accel/tcg/tcg-accel-ops-icount.c
index 84cc7421be88..a7ffc8a68bad 100644
--- a/accel/tcg/tcg-accel-ops-icount.c
+++ b/accel/tcg/tcg-accel-ops-icount.c
@@ -89,7 +89,19 @@ void icount_handle_deadline(void)
     }
 }
 
-void icount_prepare_for_run(CPUState *cpu)
+int64_t icount_cpu_timeslice(int cpu_count)
+{
+    int64_t limit = icount_get_limit();
+    int64_t timeslice = limit / cpu_count;
+
+    if (timeslice == 0) {
+        timeslice = limit;
+    }
+
+    return timeslice;
+}
+
+void icount_prepare_for_run(CPUState *cpu, int64_t timeslice)
 {
     int insns_left;
 
@@ -101,7 +113,7 @@ void icount_prepare_for_run(CPUState *cpu)
     g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
     g_assert(cpu->icount_extra == 0);
 
-    cpu->icount_budget = icount_get_limit();
+    cpu->icount_budget = MIN(icount_get_limit(), timeslice);
     insns_left = MIN(0xffff, cpu->icount_budget);
     cpu_neg(cpu)->icount_decr.u16.low = insns_left;
     cpu->icount_extra = cpu->icount_budget - insns_left;
diff --git a/accel/tcg/tcg-accel-ops-icount.h b/accel/tcg/tcg-accel-ops-icount.h
index 1b6fd9c60751..e8785a0e196d 100644
--- a/accel/tcg/tcg-accel-ops-icount.h
+++ b/accel/tcg/tcg-accel-ops-icount.h
@@ -11,7 +11,8 @@
 #define TCG_ACCEL_OPS_ICOUNT_H
 
 void icount_handle_deadline(void);
-void icount_prepare_for_run(CPUState *cpu);
+void icount_prepare_for_run(CPUState *cpu, int64_t timeslice);
+int64_t icount_cpu_timeslice(int cpu_count);
 void icount_process_data(CPUState *cpu);
 
 void icount_handle_interrupt(CPUState *cpu, int mask);
diff --git a/accel/tcg/tcg-accel-ops-rr.c b/accel/tcg/tcg-accel-ops-rr.c
index 290833a37fb2..bccb3670a656 100644
--- a/accel/tcg/tcg-accel-ops-rr.c
+++ b/accel/tcg/tcg-accel-ops-rr.c
@@ -139,6 +139,25 @@ static void rr_force_rcu(Notifier *notify, void *data)
     rr_kick_next_cpu();
 }
 
+static int rr_cpu_count(void)
+{
+    static unsigned int last_gen_id = ~0;
+    static int cpu_count;
+    CPUState *cpu;
+
+    cpu_list_lock();
+    if (cpu_list_generation_id_get() != last_gen_id) {
+        cpu_count = 0;
+        CPU_FOREACH(cpu) {
+            ++cpu_count;
+        }
+        last_gen_id = cpu_list_generation_id_get();
+    }
+    cpu_list_unlock();
+
+    return cpu_count;
+}
+
 /*
  * In the single-threaded case each vCPU is simulated in turn. If
  * there is more than a single vCPU we create a simple timer to kick
@@ -185,6 +204,9 @@ static void *rr_cpu_thread_fn(void *arg)
     cpu->exit_request = 1;
 
     while (1) {
+        int cpu_count = rr_cpu_count();
+        int64_t icount_timeslice = INT64_MAX;
+
         qemu_mutex_unlock_iothread();
         replay_mutex_lock();
         qemu_mutex_lock_iothread();
@@ -197,6 +219,8 @@ static void *rr_cpu_thread_fn(void *arg)
              * waking up the I/O thread and waiting for completion.
              */
             icount_handle_deadline();
+
+            icount_timeslice = icount_cpu_timeslice(cpu_count);
         }
 
         replay_mutex_unlock();
@@ -218,7 +242,7 @@ static void *rr_cpu_thread_fn(void *arg)
 
                 qemu_mutex_unlock_iothread();
                 if (icount_enabled()) {
-                    icount_prepare_for_run(cpu);
+                    icount_prepare_for_run(cpu, icount_timeslice);
                 }
                 r = tcg_cpus_exec(cpu);
                 if (icount_enabled()) {
-- 
2.25.1
Re: [PATCH] accel/tcg/tcg-accel-ops-rr: ensure fairness with icount
Posted by Peter Maydell 1 year ago
On Fri, 14 Apr 2023 at 15:48, Jamie Iles <quic_jiles@quicinc.com> wrote:
>
> The round-robin scheduler will iterate over the CPU list with an
> assigned budget until the next timer expiry and may exit early because
> of a TB exit.  This is fine under normal operation but with icount
> enabled and SMP it is possible for a CPU to be starved of run time and
> the system live-locks.
>
> For example, booting a riscv64 platform with '-icount
> shift=0,align=off,sleep=on -smp 2' we observe a livelock once the kernel
> has timers enabled and starts performing TLB shootdowns.  In this case
> we have CPU 0 in M-mode with interrupts disabled sending an IPI to CPU
> 1.  As we enter the TCG loop, we assign the icount budget to next timer
> interrupt to CPU 0 and begin executing where the guest is sat in a busy
> loop exhausting all of the budget before we try to execute CPU 1 which
> is the target of the IPI but CPU 1 is left with no budget with which to
> execute and the process repeats.

Coincidentally, I just ran into a fairly similar issue on Arm,
where CPU0 is in a loop waiting for CPU1 to do something
but CPU1 never gets to execute any instructions at all
because it's run out of timeslice. This patch fixes that
livelock for me.

Tested-by: Peter Maydell <peter.maydell@linaro.org>

thanks
-- PMM
Re: [PATCH] accel/tcg/tcg-accel-ops-rr: ensure fairness with icount
Posted by Philippe Mathieu-Daudé 1 year ago
Hi Jamie,

On 14/4/23 12:51, Jamie Iles wrote:
> The round-robin scheduler will iterate over the CPU list with an
> assigned budget until the next timer expiry and may exit early because
> of a TB exit.  This is fine under normal operation but with icount
> enabled and SMP it is possible for a CPU to be starved of run time and
> the system live-locks.
> 
> For example, booting a riscv64 platform with '-icount
> shift=0,align=off,sleep=on -smp 2' we observe a livelock once the kernel
> has timers enabled and starts performing TLB shootdowns.  In this case
> we have CPU 0 in M-mode with interrupts disabled sending an IPI to CPU
> 1.  As we enter the TCG loop, we assign the icount budget to next timer
> interrupt to CPU 0 and begin executing where the guest is sat in a busy
> loop exhausting all of the budget before we try to execute CPU 1 which
> is the target of the IPI but CPU 1 is left with no budget with which to
> execute and the process repeats.
> 
> We try here to add some fairness by splitting the budget across all of
> the CPUs on the thread fairly before entering each one.  The CPU count
> is cached on CPU list generation ID to avoid iterating the list on each
> loop iteration.  With this change it is possible to boot an SMP rv64
> guest with icount enabled and no hangs.
> 
> Signed-off-by: Jamie Iles <quic_jiles@quicinc.com>
> ---
>   accel/tcg/tcg-accel-ops-icount.c | 16 ++++++++++++++--
>   accel/tcg/tcg-accel-ops-icount.h |  3 ++-
>   accel/tcg/tcg-accel-ops-rr.c     | 26 +++++++++++++++++++++++++-
>   3 files changed, 41 insertions(+), 4 deletions(-)
> 
> diff --git a/accel/tcg/tcg-accel-ops-icount.c b/accel/tcg/tcg-accel-ops-icount.c
> index 84cc7421be88..a7ffc8a68bad 100644
> --- a/accel/tcg/tcg-accel-ops-icount.c
> +++ b/accel/tcg/tcg-accel-ops-icount.c
> @@ -89,7 +89,19 @@ void icount_handle_deadline(void)
>       }
>   }
>   
> -void icount_prepare_for_run(CPUState *cpu)

    /* Return icount budget shared fairly across all CPUs */

Or rename to something more explicit, such as
icount_fair_shared_budget_per_cpu()?

> +int64_t icount_cpu_timeslice(int cpu_count)
> +{
> +    int64_t limit = icount_get_limit();
> +    int64_t timeslice = limit / cpu_count;
> +
> +    if (timeslice == 0) {
> +        timeslice = limit;
> +    }
> +
> +    return timeslice;
> +}
> +
> +void icount_prepare_for_run(CPUState *cpu, int64_t timeslice)

Maybe s/timeslice/per_cpu_icount_budget_max/, max_icount_budget_per_cpu
or a more descriptive variable name? Otherwise OK.

>   {
>       int insns_left;
>   
> @@ -101,7 +113,7 @@ void icount_prepare_for_run(CPUState *cpu)
>       g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
>       g_assert(cpu->icount_extra == 0);
>   
> -    cpu->icount_budget = icount_get_limit();
> +    cpu->icount_budget = MIN(icount_get_limit(), timeslice);

Alternatively we could pass timeslice as argument to icount_get_limit().

>       insns_left = MIN(0xffff, cpu->icount_budget);
>       cpu_neg(cpu)->icount_decr.u16.low = insns_left;
>       cpu->icount_extra = cpu->icount_budget - insns_left;
> diff --git a/accel/tcg/tcg-accel-ops-icount.h b/accel/tcg/tcg-accel-ops-icount.h
> index 1b6fd9c60751..e8785a0e196d 100644
> --- a/accel/tcg/tcg-accel-ops-icount.h
> +++ b/accel/tcg/tcg-accel-ops-icount.h
> @@ -11,7 +11,8 @@
>   #define TCG_ACCEL_OPS_ICOUNT_H
>   
>   void icount_handle_deadline(void);
> -void icount_prepare_for_run(CPUState *cpu);
> +void icount_prepare_for_run(CPUState *cpu, int64_t timeslice);
> +int64_t icount_cpu_timeslice(int cpu_count);
>   void icount_process_data(CPUState *cpu);
>   
>   void icount_handle_interrupt(CPUState *cpu, int mask);
> diff --git a/accel/tcg/tcg-accel-ops-rr.c b/accel/tcg/tcg-accel-ops-rr.c
> index 290833a37fb2..bccb3670a656 100644
> --- a/accel/tcg/tcg-accel-ops-rr.c
> +++ b/accel/tcg/tcg-accel-ops-rr.c
> @@ -139,6 +139,25 @@ static void rr_force_rcu(Notifier *notify, void *data)
>       rr_kick_next_cpu();
>   }
>   

Maybe worth adding a comment to remember "The CPU count is cached on CPU
list generation ID to avoid iterating the list on each loop iteration."

> +static int rr_cpu_count(void)
> +{
> +    static unsigned int last_gen_id = ~0;
> +    static int cpu_count;
> +    CPUState *cpu;
> +
> +    cpu_list_lock();
> +    if (cpu_list_generation_id_get() != last_gen_id) {
> +        cpu_count = 0;
> +        CPU_FOREACH(cpu) {
> +            ++cpu_count;
> +        }
> +        last_gen_id = cpu_list_generation_id_get();
> +    }
> +    cpu_list_unlock();
> +
> +    return cpu_count;
> +}
> +
>   /*
>    * In the single-threaded case each vCPU is simulated in turn. If
>    * there is more than a single vCPU we create a simple timer to kick
> @@ -185,6 +204,9 @@ static void *rr_cpu_thread_fn(void *arg)
>       cpu->exit_request = 1;
>   
>       while (1) {
> +        int cpu_count = rr_cpu_count();
> +        int64_t icount_timeslice = INT64_MAX;

s/icount_timeslice/icount_budget/?


Modulo the nitpicking comments on "timeslice" naming which I'm not
familiar with, and documenting a bit the code, LGTM.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>