[PATCH] arm/hvf: Optimize and simplify WFI handling
Posted by Peter Collingbourne via 3 years, 5 months ago
Sleep on WFx until the VTIMER is due but allow ourselves to be woken
up on IPI.

Signed-off-by: Peter Collingbourne <pcc@google.com>
---
Alexander Graf wrote:
> I would love to take a patch from you here :). I'll still be stuck for a
> while with the sysreg sync rework that Peter asked for before I can look
> at WFI again.

Okay, here's a patch :) It's a relatively straightforward adaptation
of what we have in our fork, which can now boot Android to GUI while
remaining at around 4% CPU when idle.

I'm not set up to boot a full Linux distribution at the moment so I
tested it on upstream QEMU by running a recent mainline Linux kernel
with a rootfs containing an init program that just does sleep(5)
and verified that the qemu process remains at low CPU usage during
the sleep. This was on top of your v2 plus the last patch of your v1
since it doesn't look like you have a replacement for that logic yet.
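
For reference, the init program in that rootfs can be as small as the
following (a sketch, assuming a statically linked build so it can run as
PID 1 in an otherwise empty rootfs; not the exact binary used):

#include <unistd.h>

int main(void) {
  /* The guest should sit in WFI for most of this. */
  sleep(5);
  /* Keep PID 1 alive afterwards without burning CPU. */
  for (;;) {
    pause();
  }
}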

 accel/hvf/hvf-cpus.c     |  5 +--
 include/sysemu/hvf_int.h |  3 +-
 target/arm/hvf/hvf.c     | 94 +++++++++++-----------------------------
 3 files changed, 28 insertions(+), 74 deletions(-)

diff --git a/accel/hvf/hvf-cpus.c b/accel/hvf/hvf-cpus.c
index 4360f64671..b2c8fb57f6 100644
--- a/accel/hvf/hvf-cpus.c
+++ b/accel/hvf/hvf-cpus.c
@@ -344,9 +344,8 @@ static int hvf_init_vcpu(CPUState *cpu)
     sigact.sa_handler = dummy_signal;
     sigaction(SIG_IPI, &sigact, NULL);
 
-    pthread_sigmask(SIG_BLOCK, NULL, &set);
-    sigdelset(&set, SIG_IPI);
-    pthread_sigmask(SIG_SETMASK, &set, NULL);
+    pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->unblock_ipi_mask);
+    sigdelset(&cpu->hvf->unblock_ipi_mask, SIG_IPI);
 
 #ifdef __aarch64__
     r = hv_vcpu_create(&cpu->hvf->fd, (hv_vcpu_exit_t **)&cpu->hvf->exit, NULL);
diff --git a/include/sysemu/hvf_int.h b/include/sysemu/hvf_int.h
index c56baa3ae8..13adf6ea77 100644
--- a/include/sysemu/hvf_int.h
+++ b/include/sysemu/hvf_int.h
@@ -62,8 +62,7 @@ extern HVFState *hvf_state;
 struct hvf_vcpu_state {
     uint64_t fd;
     void *exit;
-    struct timespec ts;
-    bool sleeping;
+    sigset_t unblock_ipi_mask;
 };
 
 void assert_hvf_ok(hv_return_t ret);
diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c
index 8fe10966d2..60a361ff38 100644
--- a/target/arm/hvf/hvf.c
+++ b/target/arm/hvf/hvf.c
@@ -2,6 +2,7 @@
  * QEMU Hypervisor.framework support for Apple Silicon
 
  * Copyright 2020 Alexander Graf <agraf@csgraf.de>
+ * Copyright 2020 Google LLC
  *
  * This work is licensed under the terms of the GNU GPL, version 2 or later.
  * See the COPYING file in the top-level directory.
@@ -18,6 +19,7 @@
 #include "sysemu/hw_accel.h"
 
 #include <Hypervisor/Hypervisor.h>
+#include <mach/mach_time.h>
 
 #include "exec/address-spaces.h"
 #include "hw/irq.h"
@@ -320,18 +322,8 @@ int hvf_arch_init_vcpu(CPUState *cpu)
 
 void hvf_kick_vcpu_thread(CPUState *cpu)
 {
-    if (cpu->hvf->sleeping) {
-        /*
-         * When sleeping, make sure we always send signals. Also, clear the
-         * timespec, so that an IPI that arrives between setting hvf->sleeping
-         * and the nanosleep syscall still aborts the sleep.
-         */
-        cpu->thread_kicked = false;
-        cpu->hvf->ts = (struct timespec){ };
-        cpus_kick_thread(cpu);
-    } else {
-        hv_vcpus_exit(&cpu->hvf->fd, 1);
-    }
+    cpus_kick_thread(cpu);
+    hv_vcpus_exit(&cpu->hvf->fd, 1);
 }
 
 static int hvf_inject_interrupts(CPUState *cpu)
@@ -385,18 +377,19 @@ int hvf_vcpu_exec(CPUState *cpu)
         uint64_t syndrome = hvf_exit->exception.syndrome;
         uint32_t ec = syn_get_ec(syndrome);
 
+        qemu_mutex_lock_iothread();
         switch (exit_reason) {
         case HV_EXIT_REASON_EXCEPTION:
             /* This is the main one, handle below. */
             break;
         case HV_EXIT_REASON_VTIMER_ACTIVATED:
-            qemu_mutex_lock_iothread();
             current_cpu = cpu;
             qemu_set_irq(arm_cpu->gt_timer_outputs[GTIMER_VIRT], 1);
             qemu_mutex_unlock_iothread();
             continue;
         case HV_EXIT_REASON_CANCELED:
             /* we got kicked, no exit to process */
+            qemu_mutex_unlock_iothread();
             continue;
         default:
             assert(0);
@@ -413,7 +406,6 @@ int hvf_vcpu_exec(CPUState *cpu)
             uint32_t srt = (syndrome >> 16) & 0x1f;
             uint64_t val = 0;
 
-            qemu_mutex_lock_iothread();
             current_cpu = cpu;
 
             DPRINTF("data abort: [pc=0x%llx va=0x%016llx pa=0x%016llx isv=%x "
@@ -446,8 +438,6 @@ int hvf_vcpu_exec(CPUState *cpu)
                 hvf_set_reg(cpu, srt, val);
             }
 
-            qemu_mutex_unlock_iothread();
-
             advance_pc = true;
             break;
         }
@@ -493,68 +483,36 @@ int hvf_vcpu_exec(CPUState *cpu)
         case EC_WFX_TRAP:
             if (!(syndrome & WFX_IS_WFE) && !(cpu->interrupt_request &
                 (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) {
-                uint64_t cval, ctl, val, diff, now;
+                uint64_t cval;
 
-                /* Set up a local timer for vtimer if necessary ... */
-                r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CTL_EL0, &ctl);
-                assert_hvf_ok(r);
                 r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CVAL_EL0, &cval);
                 assert_hvf_ok(r);
 
-                asm volatile("mrs %0, cntvct_el0" : "=r"(val));
-                diff = cval - val;
-
-                now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) /
-                      gt_cntfrq_period_ns(arm_cpu);
-
-                /* Timer disabled or masked, just wait for long */
-                if (!(ctl & 1) || (ctl & 2)) {
-                    diff = (120 * NANOSECONDS_PER_SECOND) /
-                           gt_cntfrq_period_ns(arm_cpu);
+                int64_t ticks_to_sleep = cval - mach_absolute_time();
+                if (ticks_to_sleep < 0) {
+                    break;
                 }
 
-                if (diff < INT64_MAX) {
-                    uint64_t ns = diff * gt_cntfrq_period_ns(arm_cpu);
-                    struct timespec *ts = &cpu->hvf->ts;
-
-                    *ts = (struct timespec){
-                        .tv_sec = ns / NANOSECONDS_PER_SECOND,
-                        .tv_nsec = ns % NANOSECONDS_PER_SECOND,
-                    };
-
-                    /*
-                     * Waking up easily takes 1ms, don't go to sleep for smaller
-                     * time periods than 2ms.
-                     */
-                    if (!ts->tv_sec && (ts->tv_nsec < (SCALE_MS * 2))) {
-                        advance_pc = true;
-                        break;
-                    }
-
-                    /* Set cpu->hvf->sleeping so that we get a SIG_IPI signal. */
-                    cpu->hvf->sleeping = true;
-                    smp_mb();
-
-                    /* Bail out if we received an IRQ meanwhile */
-                    if (cpu->thread_kicked || (cpu->interrupt_request &
-                        (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) {
-                        cpu->hvf->sleeping = false;
-                        break;
-                    }
-
-                    /* nanosleep returns on signal, so we wake up on kick. */
-                    nanosleep(ts, NULL);
-
-                    /* Out of sleep - either naturally or because of a kick */
-                    cpu->hvf->sleeping = false;
-                }
+                uint64_t seconds = ticks_to_sleep / arm_cpu->gt_cntfrq_hz;
+                uint64_t nanos =
+                    (ticks_to_sleep - arm_cpu->gt_cntfrq_hz * seconds) *
+                    1000000000 / arm_cpu->gt_cntfrq_hz;
+                struct timespec ts = { seconds, nanos };
+
+                /*
+                 * Use pselect to sleep so that other threads can IPI us while
+                 * we're sleeping.
+                 */
+                qatomic_mb_set(&cpu->thread_kicked, false);
+                qemu_mutex_unlock_iothread();
+                pselect(0, 0, 0, 0, &ts, &cpu->hvf->unblock_ipi_mask);
+                qemu_mutex_lock_iothread();
 
                 advance_pc = true;
             }
             break;
         case EC_AA64_HVC:
             cpu_synchronize_state(cpu);
-            qemu_mutex_lock_iothread();
             current_cpu = cpu;
             if (arm_is_psci_call(arm_cpu, EXCP_HVC)) {
                 arm_handle_psci_call(arm_cpu);
@@ -562,11 +520,9 @@ int hvf_vcpu_exec(CPUState *cpu)
                 DPRINTF("unknown HVC! %016llx", env->xregs[0]);
                 env->xregs[0] = -1;
             }
-            qemu_mutex_unlock_iothread();
             break;
         case EC_AA64_SMC:
             cpu_synchronize_state(cpu);
-            qemu_mutex_lock_iothread();
             current_cpu = cpu;
             if (arm_is_psci_call(arm_cpu, EXCP_SMC)) {
                 arm_handle_psci_call(arm_cpu);
@@ -575,7 +531,6 @@ int hvf_vcpu_exec(CPUState *cpu)
                 env->xregs[0] = -1;
                 env->pc += 4;
             }
-            qemu_mutex_unlock_iothread();
             break;
         default:
             cpu_synchronize_state(cpu);
@@ -594,6 +549,7 @@ int hvf_vcpu_exec(CPUState *cpu)
             r = hv_vcpu_set_reg(cpu->hvf->fd, HV_REG_PC, pc);
             assert_hvf_ok(r);
         }
+        qemu_mutex_unlock_iothread();
     } while (ret == 0);
 
     qemu_mutex_lock_iothread();
-- 
2.29.2.454.gaff20da3a2-goog


Re: [PATCH] arm/hvf: Optimize and simplify WFI handling
Posted by Alexander Graf 3 years, 5 months ago
Hi Peter,

On 01.12.20 09:21, Peter Collingbourne wrote:
> Sleep on WFx until the VTIMER is due but allow ourselves to be woken
> up on IPI.
>
> Signed-off-by: Peter Collingbourne <pcc@google.com>


Thanks a bunch!


> ---
> Alexander Graf wrote:
>> I would love to take a patch from you here :). I'll still be stuck for a
>> while with the sysreg sync rework that Peter asked for before I can look
>> at WFI again.
> Okay, here's a patch :) It's a relatively straightforward adaptation
> of what we have in our fork, which can now boot Android to GUI while
> remaining at around 4% CPU when idle.
>
> I'm not set up to boot a full Linux distribution at the moment so I
> tested it on upstream QEMU by running a recent mainline Linux kernel
> with a rootfs containing an init program that just does sleep(5)
> and verified that the qemu process remains at low CPU usage during
> the sleep. This was on top of your v2 plus the last patch of your v1
> since it doesn't look like you have a replacement for that logic yet.
>
>   accel/hvf/hvf-cpus.c     |  5 +--
>   include/sysemu/hvf_int.h |  3 +-
>   target/arm/hvf/hvf.c     | 94 +++++++++++-----------------------------
>   3 files changed, 28 insertions(+), 74 deletions(-)
>
> diff --git a/accel/hvf/hvf-cpus.c b/accel/hvf/hvf-cpus.c
> index 4360f64671..b2c8fb57f6 100644
> --- a/accel/hvf/hvf-cpus.c
> +++ b/accel/hvf/hvf-cpus.c
> @@ -344,9 +344,8 @@ static int hvf_init_vcpu(CPUState *cpu)
>       sigact.sa_handler = dummy_signal;
>       sigaction(SIG_IPI, &sigact, NULL);
>   
> -    pthread_sigmask(SIG_BLOCK, NULL, &set);
> -    sigdelset(&set, SIG_IPI);
> -    pthread_sigmask(SIG_SETMASK, &set, NULL);
> +    pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->unblock_ipi_mask);
> +    sigdelset(&cpu->hvf->unblock_ipi_mask, SIG_IPI);


What will this do to the x86 hvf implementation? We're now not 
unblocking SIG_IPI again for that, right?


>   
>   #ifdef __aarch64__
>       r = hv_vcpu_create(&cpu->hvf->fd, (hv_vcpu_exit_t **)&cpu->hvf->exit, NULL);
> diff --git a/include/sysemu/hvf_int.h b/include/sysemu/hvf_int.h
> index c56baa3ae8..13adf6ea77 100644
> --- a/include/sysemu/hvf_int.h
> +++ b/include/sysemu/hvf_int.h
> @@ -62,8 +62,7 @@ extern HVFState *hvf_state;
>   struct hvf_vcpu_state {
>       uint64_t fd;
>       void *exit;
> -    struct timespec ts;
> -    bool sleeping;
> +    sigset_t unblock_ipi_mask;
>   };
>   
>   void assert_hvf_ok(hv_return_t ret);
> diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c
> index 8fe10966d2..60a361ff38 100644
> --- a/target/arm/hvf/hvf.c
> +++ b/target/arm/hvf/hvf.c
> @@ -2,6 +2,7 @@
>    * QEMU Hypervisor.framework support for Apple Silicon
>   
>    * Copyright 2020 Alexander Graf <agraf@csgraf.de>
> + * Copyright 2020 Google LLC
>    *
>    * This work is licensed under the terms of the GNU GPL, version 2 or later.
>    * See the COPYING file in the top-level directory.
> @@ -18,6 +19,7 @@
>   #include "sysemu/hw_accel.h"
>   
>   #include <Hypervisor/Hypervisor.h>
> +#include <mach/mach_time.h>
>   
>   #include "exec/address-spaces.h"
>   #include "hw/irq.h"
> @@ -320,18 +322,8 @@ int hvf_arch_init_vcpu(CPUState *cpu)
>   
>   void hvf_kick_vcpu_thread(CPUState *cpu)
>   {
> -    if (cpu->hvf->sleeping) {
> -        /*
> -         * When sleeping, make sure we always send signals. Also, clear the
> -         * timespec, so that an IPI that arrives between setting hvf->sleeping
> -         * and the nanosleep syscall still aborts the sleep.
> -         */
> -        cpu->thread_kicked = false;
> -        cpu->hvf->ts = (struct timespec){ };
> -        cpus_kick_thread(cpu);
> -    } else {
> -        hv_vcpus_exit(&cpu->hvf->fd, 1);
> -    }
> +    cpus_kick_thread(cpu);
> +    hv_vcpus_exit(&cpu->hvf->fd, 1);


This means your first WFI will almost always return immediately due to a 
pending signal, because there probably was an IRQ pending before on the 
same CPU, no?


>   }
>   
>   static int hvf_inject_interrupts(CPUState *cpu)
> @@ -385,18 +377,19 @@ int hvf_vcpu_exec(CPUState *cpu)
>           uint64_t syndrome = hvf_exit->exception.syndrome;
>           uint32_t ec = syn_get_ec(syndrome);
>   
> +        qemu_mutex_lock_iothread();


Is there a particular reason you're moving the iothread lock out again 
from the individual bits? I would really like to keep a notion of fast 
path exits.


>           switch (exit_reason) {
>           case HV_EXIT_REASON_EXCEPTION:
>               /* This is the main one, handle below. */
>               break;
>           case HV_EXIT_REASON_VTIMER_ACTIVATED:
> -            qemu_mutex_lock_iothread();
>               current_cpu = cpu;
>               qemu_set_irq(arm_cpu->gt_timer_outputs[GTIMER_VIRT], 1);
>               qemu_mutex_unlock_iothread();
>               continue;
>           case HV_EXIT_REASON_CANCELED:
>               /* we got kicked, no exit to process */
> +            qemu_mutex_unlock_iothread();
>               continue;
>           default:
>               assert(0);
> @@ -413,7 +406,6 @@ int hvf_vcpu_exec(CPUState *cpu)
>               uint32_t srt = (syndrome >> 16) & 0x1f;
>               uint64_t val = 0;
>   
> -            qemu_mutex_lock_iothread();
>               current_cpu = cpu;
>   
>               DPRINTF("data abort: [pc=0x%llx va=0x%016llx pa=0x%016llx isv=%x "
> @@ -446,8 +438,6 @@ int hvf_vcpu_exec(CPUState *cpu)
>                   hvf_set_reg(cpu, srt, val);
>               }
>   
> -            qemu_mutex_unlock_iothread();
> -
>               advance_pc = true;
>               break;
>           }
> @@ -493,68 +483,36 @@ int hvf_vcpu_exec(CPUState *cpu)
>           case EC_WFX_TRAP:
>               if (!(syndrome & WFX_IS_WFE) && !(cpu->interrupt_request &
>                   (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) {
> -                uint64_t cval, ctl, val, diff, now;
> +                uint64_t cval;
>   
> -                /* Set up a local timer for vtimer if necessary ... */
> -                r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CTL_EL0, &ctl);
> -                assert_hvf_ok(r);
>                   r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CVAL_EL0, &cval);
>                   assert_hvf_ok(r);
>   
> -                asm volatile("mrs %0, cntvct_el0" : "=r"(val));
> -                diff = cval - val;
> -
> -                now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) /
> -                      gt_cntfrq_period_ns(arm_cpu);
> -
> -                /* Timer disabled or masked, just wait for long */
> -                if (!(ctl & 1) || (ctl & 2)) {
> -                    diff = (120 * NANOSECONDS_PER_SECOND) /
> -                           gt_cntfrq_period_ns(arm_cpu);
> +                int64_t ticks_to_sleep = cval - mach_absolute_time();
> +                if (ticks_to_sleep < 0) {
> +                    break;


This will loop at 100% for Windows, which configures the vtimer as 
cval=0 ctl=7, so with IRQ mask bit set.


Alex


>                   }
>   
> -                if (diff < INT64_MAX) {
> -                    uint64_t ns = diff * gt_cntfrq_period_ns(arm_cpu);
> -                    struct timespec *ts = &cpu->hvf->ts;
> -
> -                    *ts = (struct timespec){
> -                        .tv_sec = ns / NANOSECONDS_PER_SECOND,
> -                        .tv_nsec = ns % NANOSECONDS_PER_SECOND,
> -                    };
> -
> -                    /*
> -                     * Waking up easily takes 1ms, don't go to sleep for smaller
> -                     * time periods than 2ms.
> -                     */
> -                    if (!ts->tv_sec && (ts->tv_nsec < (SCALE_MS * 2))) {


I put this logic here on purpose. A pselect(1 ns) easily takes 1-2ms to 
return. Without logic like this, super short WFIs will hurt performance 
quite badly.


Alex

> -                        advance_pc = true;
> -                        break;
> -                    }
> -
> -                    /* Set cpu->hvf->sleeping so that we get a SIG_IPI signal. */
> -                    cpu->hvf->sleeping = true;
> -                    smp_mb();
> -
> -                    /* Bail out if we received an IRQ meanwhile */
> -                    if (cpu->thread_kicked || (cpu->interrupt_request &
> -                        (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) {
> -                        cpu->hvf->sleeping = false;
> -                        break;
> -                    }
> -
> -                    /* nanosleep returns on signal, so we wake up on kick. */
> -                    nanosleep(ts, NULL);
> -
> -                    /* Out of sleep - either naturally or because of a kick */
> -                    cpu->hvf->sleeping = false;
> -                }
> +                uint64_t seconds = ticks_to_sleep / arm_cpu->gt_cntfrq_hz;
> +                uint64_t nanos =
> +                    (ticks_to_sleep - arm_cpu->gt_cntfrq_hz * seconds) *
> +                    1000000000 / arm_cpu->gt_cntfrq_hz;
> +                struct timespec ts = { seconds, nanos };
> +
> +                /*
> +                 * Use pselect to sleep so that other threads can IPI us while
> +                 * we're sleeping.
> +                 */
> +                qatomic_mb_set(&cpu->thread_kicked, false);
> +                qemu_mutex_unlock_iothread();
> +                pselect(0, 0, 0, 0, &ts, &cpu->hvf->unblock_ipi_mask);
> +                qemu_mutex_lock_iothread();
>   
>                   advance_pc = true;
>               }
>               break;
>           case EC_AA64_HVC:
>               cpu_synchronize_state(cpu);
> -            qemu_mutex_lock_iothread();
>               current_cpu = cpu;
>               if (arm_is_psci_call(arm_cpu, EXCP_HVC)) {
>                   arm_handle_psci_call(arm_cpu);
> @@ -562,11 +520,9 @@ int hvf_vcpu_exec(CPUState *cpu)
>                   DPRINTF("unknown HVC! %016llx", env->xregs[0]);
>                   env->xregs[0] = -1;
>               }
> -            qemu_mutex_unlock_iothread();
>               break;
>           case EC_AA64_SMC:
>               cpu_synchronize_state(cpu);
> -            qemu_mutex_lock_iothread();
>               current_cpu = cpu;
>               if (arm_is_psci_call(arm_cpu, EXCP_SMC)) {
>                   arm_handle_psci_call(arm_cpu);
> @@ -575,7 +531,6 @@ int hvf_vcpu_exec(CPUState *cpu)
>                   env->xregs[0] = -1;
>                   env->pc += 4;
>               }
> -            qemu_mutex_unlock_iothread();
>               break;
>           default:
>               cpu_synchronize_state(cpu);
> @@ -594,6 +549,7 @@ int hvf_vcpu_exec(CPUState *cpu)
>               r = hv_vcpu_set_reg(cpu->hvf->fd, HV_REG_PC, pc);
>               assert_hvf_ok(r);
>           }
> +        qemu_mutex_unlock_iothread();
>       } while (ret == 0);
>   
>       qemu_mutex_lock_iothread();

Re: [PATCH] arm/hvf: Optimize and simplify WFI handling
Posted by Peter Collingbourne 3 years, 5 months ago
On Tue, Dec 1, 2020 at 3:16 AM Alexander Graf <agraf@csgraf.de> wrote:
>
> Hi Peter,
>
> On 01.12.20 09:21, Peter Collingbourne wrote:
> > Sleep on WFx until the VTIMER is due but allow ourselves to be woken
> > up on IPI.
> >
> > Signed-off-by: Peter Collingbourne <pcc@google.com>
>
>
> Thanks a bunch!
>
>
> > ---
> > Alexander Graf wrote:
> >> I would love to take a patch from you here :). I'll still be stuck for a
> >> while with the sysreg sync rework that Peter asked for before I can look
> >> at WFI again.
> > Okay, here's a patch :) It's a relatively straightforward adaptation
> > of what we have in our fork, which can now boot Android to GUI while
> > remaining at around 4% CPU when idle.
> >
> > I'm not set up to boot a full Linux distribution at the moment so I
> > tested it on upstream QEMU by running a recent mainline Linux kernel
> > with a rootfs containing an init program that just does sleep(5)
> > and verified that the qemu process remains at low CPU usage during
> > the sleep. This was on top of your v2 plus the last patch of your v1
> > since it doesn't look like you have a replacement for that logic yet.
> >
> >   accel/hvf/hvf-cpus.c     |  5 +--
> >   include/sysemu/hvf_int.h |  3 +-
> >   target/arm/hvf/hvf.c     | 94 +++++++++++-----------------------------
> >   3 files changed, 28 insertions(+), 74 deletions(-)
> >
> > diff --git a/accel/hvf/hvf-cpus.c b/accel/hvf/hvf-cpus.c
> > index 4360f64671..b2c8fb57f6 100644
> > --- a/accel/hvf/hvf-cpus.c
> > +++ b/accel/hvf/hvf-cpus.c
> > @@ -344,9 +344,8 @@ static int hvf_init_vcpu(CPUState *cpu)
> >       sigact.sa_handler = dummy_signal;
> >       sigaction(SIG_IPI, &sigact, NULL);
> >
> > -    pthread_sigmask(SIG_BLOCK, NULL, &set);
> > -    sigdelset(&set, SIG_IPI);
> > -    pthread_sigmask(SIG_SETMASK, &set, NULL);
> > +    pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->unblock_ipi_mask);
> > +    sigdelset(&cpu->hvf->unblock_ipi_mask, SIG_IPI);
>
>
> What will this do to the x86 hvf implementation? We're now not
> unblocking SIG_IPI again for that, right?

Yes and that was the case before your patch series.

> >
> >   #ifdef __aarch64__
> >       r = hv_vcpu_create(&cpu->hvf->fd, (hv_vcpu_exit_t **)&cpu->hvf->exit, NULL);
> > diff --git a/include/sysemu/hvf_int.h b/include/sysemu/hvf_int.h
> > index c56baa3ae8..13adf6ea77 100644
> > --- a/include/sysemu/hvf_int.h
> > +++ b/include/sysemu/hvf_int.h
> > @@ -62,8 +62,7 @@ extern HVFState *hvf_state;
> >   struct hvf_vcpu_state {
> >       uint64_t fd;
> >       void *exit;
> > -    struct timespec ts;
> > -    bool sleeping;
> > +    sigset_t unblock_ipi_mask;
> >   };
> >
> >   void assert_hvf_ok(hv_return_t ret);
> > diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c
> > index 8fe10966d2..60a361ff38 100644
> > --- a/target/arm/hvf/hvf.c
> > +++ b/target/arm/hvf/hvf.c
> > @@ -2,6 +2,7 @@
> >    * QEMU Hypervisor.framework support for Apple Silicon
> >
> >    * Copyright 2020 Alexander Graf <agraf@csgraf.de>
> > + * Copyright 2020 Google LLC
> >    *
> >    * This work is licensed under the terms of the GNU GPL, version 2 or later.
> >    * See the COPYING file in the top-level directory.
> > @@ -18,6 +19,7 @@
> >   #include "sysemu/hw_accel.h"
> >
> >   #include <Hypervisor/Hypervisor.h>
> > +#include <mach/mach_time.h>
> >
> >   #include "exec/address-spaces.h"
> >   #include "hw/irq.h"
> > @@ -320,18 +322,8 @@ int hvf_arch_init_vcpu(CPUState *cpu)
> >
> >   void hvf_kick_vcpu_thread(CPUState *cpu)
> >   {
> > -    if (cpu->hvf->sleeping) {
> > -        /*
> > -         * When sleeping, make sure we always send signals. Also, clear the
> > -         * timespec, so that an IPI that arrives between setting hvf->sleeping
> > -         * and the nanosleep syscall still aborts the sleep.
> > -         */
> > -        cpu->thread_kicked = false;
> > -        cpu->hvf->ts = (struct timespec){ };
> > -        cpus_kick_thread(cpu);
> > -    } else {
> > -        hv_vcpus_exit(&cpu->hvf->fd, 1);
> > -    }
> > +    cpus_kick_thread(cpu);
> > +    hv_vcpus_exit(&cpu->hvf->fd, 1);
>
>
> This means your first WFI will almost always return immediately due to a
> pending signal, because there probably was an IRQ pending before on the
> same CPU, no?

That's right. Any approach involving the "sleeping" field would need
to be implemented carefully to avoid races that may result in missed
wakeups so for simplicity I just decided to send both kinds of
wakeups. In particular the approach in the updated patch you sent is
racy and I'll elaborate more in the reply to that patch.
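
The property that makes the pselect() approach safe is that the switch to
the unblocking signal mask happens atomically inside the syscall, so a kick
sent at any point while SIG_IPI is still blocked stays pending and
interrupts the sleep instead of being swallowed. A standalone sketch of
that behaviour (illustrative only, using SIGUSR1 in place of SIG_IPI):

#include <signal.h>
#include <stdio.h>
#include <sys/select.h>
#include <unistd.h>

static void dummy(int sig) { (void)sig; }

int main(void) {
  struct sigaction sa;
  sa.sa_handler = dummy;
  sigemptyset(&sa.sa_mask);
  sa.sa_flags = 0;
  sigaction(SIGUSR1, &sa, NULL);

  /* Block SIGUSR1, like the vCPU thread blocks SIG_IPI outside pselect(). */
  sigset_t blocked, unblock_mask;
  pthread_sigmask(SIG_SETMASK, 0, &blocked);
  sigaddset(&blocked, SIGUSR1);
  pthread_sigmask(SIG_SETMASK, &blocked, NULL);
  unblock_mask = blocked;
  sigdelset(&unblock_mask, SIGUSR1);

  /* The "kick" arrives before we go to sleep; it stays pending. */
  kill(getpid(), SIGUSR1);

  /* The pending signal is delivered as soon as pselect() installs
   * unblock_mask, so this returns -1/EINTR immediately rather than
   * sleeping for 5 seconds. */
  struct timespec ts = { 5, 0 };
  int r = pselect(0, 0, 0, 0, &ts, &unblock_mask);
  printf("pselect returned %d\n", r);
  return 0;
}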

> >   }
> >
> >   static int hvf_inject_interrupts(CPUState *cpu)
> > @@ -385,18 +377,19 @@ int hvf_vcpu_exec(CPUState *cpu)
> >           uint64_t syndrome = hvf_exit->exception.syndrome;
> >           uint32_t ec = syn_get_ec(syndrome);
> >
> > +        qemu_mutex_lock_iothread();
>
>
> Is there a particular reason you're moving the iothread lock out again
> from the individual bits? I would really like to keep a notion of fast
> path exits.

We still need to lock at least once no matter the exit reason to check
the interrupts so I don't think it's worth it to try and avoid locking
like this. It also makes the implementation easier to reason about and
therefore more likely to be correct. In our implementation we just
stay locked the whole time unless we're in hv_vcpu_run() or pselect().
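
Roughly, the intended shape of the loop is (a sketch of the discipline,
not a literal excerpt):

    do {
        r = hv_vcpu_run(cpu->hvf->fd);       /* guest runs without the lock */
        assert_hvf_ok(r);

        qemu_mutex_lock_iothread();          /* locked for all exit handling */
        switch (...) {
        ...
        case EC_WFX_TRAP:
            ...
            qemu_mutex_unlock_iothread();    /* only drop it around the sleep */
            pselect(0, 0, 0, 0, &ts, &cpu->hvf->unblock_ipi_mask);
            qemu_mutex_lock_iothread();
            break;
        ...
        }
        /* interrupt injection, PC advance etc. run with the lock held */
        qemu_mutex_unlock_iothread();
    } while (ret == 0);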

> >           switch (exit_reason) {
> >           case HV_EXIT_REASON_EXCEPTION:
> >               /* This is the main one, handle below. */
> >               break;
> >           case HV_EXIT_REASON_VTIMER_ACTIVATED:
> > -            qemu_mutex_lock_iothread();
> >               current_cpu = cpu;
> >               qemu_set_irq(arm_cpu->gt_timer_outputs[GTIMER_VIRT], 1);
> >               qemu_mutex_unlock_iothread();
> >               continue;
> >           case HV_EXIT_REASON_CANCELED:
> >               /* we got kicked, no exit to process */
> > +            qemu_mutex_unlock_iothread();
> >               continue;
> >           default:
> >               assert(0);
> > @@ -413,7 +406,6 @@ int hvf_vcpu_exec(CPUState *cpu)
> >               uint32_t srt = (syndrome >> 16) & 0x1f;
> >               uint64_t val = 0;
> >
> > -            qemu_mutex_lock_iothread();
> >               current_cpu = cpu;
> >
> >               DPRINTF("data abort: [pc=0x%llx va=0x%016llx pa=0x%016llx isv=%x "
> > @@ -446,8 +438,6 @@ int hvf_vcpu_exec(CPUState *cpu)
> >                   hvf_set_reg(cpu, srt, val);
> >               }
> >
> > -            qemu_mutex_unlock_iothread();
> > -
> >               advance_pc = true;
> >               break;
> >           }
> > @@ -493,68 +483,36 @@ int hvf_vcpu_exec(CPUState *cpu)
> >           case EC_WFX_TRAP:
> >               if (!(syndrome & WFX_IS_WFE) && !(cpu->interrupt_request &
> >                   (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) {
> > -                uint64_t cval, ctl, val, diff, now;
> > +                uint64_t cval;
> >
> > -                /* Set up a local timer for vtimer if necessary ... */
> > -                r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CTL_EL0, &ctl);
> > -                assert_hvf_ok(r);
> >                   r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CVAL_EL0, &cval);
> >                   assert_hvf_ok(r);
> >
> > -                asm volatile("mrs %0, cntvct_el0" : "=r"(val));
> > -                diff = cval - val;
> > -
> > -                now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) /
> > -                      gt_cntfrq_period_ns(arm_cpu);
> > -
> > -                /* Timer disabled or masked, just wait for long */
> > -                if (!(ctl & 1) || (ctl & 2)) {
> > -                    diff = (120 * NANOSECONDS_PER_SECOND) /
> > -                           gt_cntfrq_period_ns(arm_cpu);
> > +                int64_t ticks_to_sleep = cval - mach_absolute_time();
> > +                if (ticks_to_sleep < 0) {
> > +                    break;
>
>
> This will loop at 100% for Windows, which configures the vtimer as
> cval=0 ctl=7, so with IRQ mask bit set.

Okay, but the 120s is kind of arbitrary so we should just sleep until
we get a signal. That can be done by passing null as the timespec
argument to pselect().
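
Concretely, something along these lines (sketch; CNTV_CTL_EL0 bit 0 is the
ENABLE bit and bit 1 is IMASK, i.e. the same check the v1 patch already did):

    r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CTL_EL0, &ctl);
    assert_hvf_ok(r);

    if (!(ctl & 1) || (ctl & 2)) {
        /* Timer disabled or masked: no deadline, sleep until SIG_IPI. */
        qemu_mutex_unlock_iothread();
        pselect(0, 0, 0, 0, NULL, &cpu->hvf->unblock_ipi_mask);
        qemu_mutex_lock_iothread();
    } else {
        int64_t ticks_to_sleep = cval - mach_absolute_time();
        if (ticks_to_sleep > 0) {
            /* e.g. 24000 ticks at a 24 MHz timebase -> { 0, 1000000 } (1 ms) */
            uint64_t seconds = ticks_to_sleep / arm_cpu->gt_cntfrq_hz;
            uint64_t nanos = (ticks_to_sleep % arm_cpu->gt_cntfrq_hz) *
                             1000000000 / arm_cpu->gt_cntfrq_hz;
            struct timespec ts = { seconds, nanos };

            qemu_mutex_unlock_iothread();
            pselect(0, 0, 0, 0, &ts, &cpu->hvf->unblock_ipi_mask);
            qemu_mutex_lock_iothread();
        }
    }

(thread_kicked handling and advance_pc as in the patch)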

>
>
> Alex
>
>
> >                   }
> >
> > -                if (diff < INT64_MAX) {
> > -                    uint64_t ns = diff * gt_cntfrq_period_ns(arm_cpu);
> > -                    struct timespec *ts = &cpu->hvf->ts;
> > -
> > -                    *ts = (struct timespec){
> > -                        .tv_sec = ns / NANOSECONDS_PER_SECOND,
> > -                        .tv_nsec = ns % NANOSECONDS_PER_SECOND,
> > -                    };
> > -
> > -                    /*
> > -                     * Waking up easily takes 1ms, don't go to sleep for smaller
> > -                     * time periods than 2ms.
> > -                     */
> > -                    if (!ts->tv_sec && (ts->tv_nsec < (SCALE_MS * 2))) {
>
>
> I put this logic here on purpose. A pselect(1 ns) easily takes 1-2ms to
> return. Without logic like this, super short WFIs will hurt performance
> quite badly.

I don't think that's accurate. According to this benchmark it's a few
hundred nanoseconds at most.

pcc@pac-mini /tmp> cat pselect.c
#include <signal.h>
#include <sys/select.h>

int main() {
  sigset_t mask, orig_mask;
  pthread_sigmask(SIG_SETMASK, 0, &mask);
  sigaddset(&mask, SIGUSR1);
  pthread_sigmask(SIG_SETMASK, &mask, &orig_mask);

  for (int i = 0; i != 1000000; ++i) {
    struct timespec ts = { 0, 1 };
    pselect(0, 0, 0, 0, &ts, &orig_mask);
  }
}
pcc@pac-mini /tmp> time ./pselect

________________________________________________________
Executed in  179.87 millis    fish           external
   usr time   77.68 millis   57.00 micros   77.62 millis
   sys time  101.37 millis  852.00 micros  100.52 millis

Besides, all that you're really saving here is the single pselect
call. There are no doubt more expensive syscalls involved in exiting
and entering the VCPU that would dominate here.

Peter

>
>
> Alex
>
> > -                        advance_pc = true;
> > -                        break;
> > -                    }
> > -
> > -                    /* Set cpu->hvf->sleeping so that we get a SIG_IPI signal. */
> > -                    cpu->hvf->sleeping = true;
> > -                    smp_mb();
> > -
> > -                    /* Bail out if we received an IRQ meanwhile */
> > -                    if (cpu->thread_kicked || (cpu->interrupt_request &
> > -                        (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) {
> > -                        cpu->hvf->sleeping = false;
> > -                        break;
> > -                    }
> > -
> > -                    /* nanosleep returns on signal, so we wake up on kick. */
> > -                    nanosleep(ts, NULL);
> > -
> > -                    /* Out of sleep - either naturally or because of a kick */
> > -                    cpu->hvf->sleeping = false;
> > -                }
> > +                uint64_t seconds = ticks_to_sleep / arm_cpu->gt_cntfrq_hz;
> > +                uint64_t nanos =
> > +                    (ticks_to_sleep - arm_cpu->gt_cntfrq_hz * seconds) *
> > +                    1000000000 / arm_cpu->gt_cntfrq_hz;
> > +                struct timespec ts = { seconds, nanos };
> > +
> > +                /*
> > +                 * Use pselect to sleep so that other threads can IPI us while
> > +                 * we're sleeping.
> > +                 */
> > +                qatomic_mb_set(&cpu->thread_kicked, false);
> > +                qemu_mutex_unlock_iothread();
> > +                pselect(0, 0, 0, 0, &ts, &cpu->hvf->unblock_ipi_mask);
> > +                qemu_mutex_lock_iothread();
> >
> >                   advance_pc = true;
> >               }
> >               break;
> >           case EC_AA64_HVC:
> >               cpu_synchronize_state(cpu);
> > -            qemu_mutex_lock_iothread();
> >               current_cpu = cpu;
> >               if (arm_is_psci_call(arm_cpu, EXCP_HVC)) {
> >                   arm_handle_psci_call(arm_cpu);
> > @@ -562,11 +520,9 @@ int hvf_vcpu_exec(CPUState *cpu)
> >                   DPRINTF("unknown HVC! %016llx", env->xregs[0]);
> >                   env->xregs[0] = -1;
> >               }
> > -            qemu_mutex_unlock_iothread();
> >               break;
> >           case EC_AA64_SMC:
> >               cpu_synchronize_state(cpu);
> > -            qemu_mutex_lock_iothread();
> >               current_cpu = cpu;
> >               if (arm_is_psci_call(arm_cpu, EXCP_SMC)) {
> >                   arm_handle_psci_call(arm_cpu);
> > @@ -575,7 +531,6 @@ int hvf_vcpu_exec(CPUState *cpu)
> >                   env->xregs[0] = -1;
> >                   env->pc += 4;
> >               }
> > -            qemu_mutex_unlock_iothread();
> >               break;
> >           default:
> >               cpu_synchronize_state(cpu);
> > @@ -594,6 +549,7 @@ int hvf_vcpu_exec(CPUState *cpu)
> >               r = hv_vcpu_set_reg(cpu->hvf->fd, HV_REG_PC, pc);
> >               assert_hvf_ok(r);
> >           }
> > +        qemu_mutex_unlock_iothread();
> >       } while (ret == 0);
> >
> >       qemu_mutex_lock_iothread();

Re: [PATCH] arm/hvf: Optimize and simplify WFI handling
Posted by Alexander Graf 3 years, 5 months ago
On 01.12.20 19:59, Peter Collingbourne wrote:
> On Tue, Dec 1, 2020 at 3:16 AM Alexander Graf <agraf@csgraf.de> wrote:
>> Hi Peter,
>>
>> On 01.12.20 09:21, Peter Collingbourne wrote:
>>> Sleep on WFx until the VTIMER is due but allow ourselves to be woken
>>> up on IPI.
>>>
>>> Signed-off-by: Peter Collingbourne <pcc@google.com>
>>
>> Thanks a bunch!
>>
>>
>>> ---
>>> Alexander Graf wrote:
>>>> I would love to take a patch from you here :). I'll still be stuck for a
>>>> while with the sysreg sync rework that Peter asked for before I can look
>>>> at WFI again.
>>> Okay, here's a patch :) It's a relatively straightforward adaptation
>>> of what we have in our fork, which can now boot Android to GUI while
>>> remaining at around 4% CPU when idle.
>>>
>>> I'm not set up to boot a full Linux distribution at the moment so I
>>> tested it on upstream QEMU by running a recent mainline Linux kernel
>>> with a rootfs containing an init program that just does sleep(5)
>>> and verified that the qemu process remains at low CPU usage during
>>> the sleep. This was on top of your v2 plus the last patch of your v1
>>> since it doesn't look like you have a replacement for that logic yet.
>>>
>>>    accel/hvf/hvf-cpus.c     |  5 +--
>>>    include/sysemu/hvf_int.h |  3 +-
>>>    target/arm/hvf/hvf.c     | 94 +++++++++++-----------------------------
>>>    3 files changed, 28 insertions(+), 74 deletions(-)
>>>
>>> diff --git a/accel/hvf/hvf-cpus.c b/accel/hvf/hvf-cpus.c
>>> index 4360f64671..b2c8fb57f6 100644
>>> --- a/accel/hvf/hvf-cpus.c
>>> +++ b/accel/hvf/hvf-cpus.c
>>> @@ -344,9 +344,8 @@ static int hvf_init_vcpu(CPUState *cpu)
>>>        sigact.sa_handler = dummy_signal;
>>>        sigaction(SIG_IPI, &sigact, NULL);
>>>
>>> -    pthread_sigmask(SIG_BLOCK, NULL, &set);
>>> -    sigdelset(&set, SIG_IPI);
>>> -    pthread_sigmask(SIG_SETMASK, &set, NULL);
>>> +    pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->unblock_ipi_mask);
>>> +    sigdelset(&cpu->hvf->unblock_ipi_mask, SIG_IPI);
>>
>> What will this do to the x86 hvf implementation? We're now not
>> unblocking SIG_IPI again for that, right?
> Yes and that was the case before your patch series.


The way I understand Roman, he wanted to unblock the IPI signal on x86:

https://patchwork.kernel.org/project/qemu-devel/patch/20201126215017.41156-3-agraf@csgraf.de/#23807021

I agree that at this point it's not a problem though to break it again. 
I'm not quite sure how to merge your patches within my patch set though, 
given they basically revert half of my previously introduced code...


>
>>>    #ifdef __aarch64__
>>>        r = hv_vcpu_create(&cpu->hvf->fd, (hv_vcpu_exit_t **)&cpu->hvf->exit, NULL);
>>> diff --git a/include/sysemu/hvf_int.h b/include/sysemu/hvf_int.h
>>> index c56baa3ae8..13adf6ea77 100644
>>> --- a/include/sysemu/hvf_int.h
>>> +++ b/include/sysemu/hvf_int.h
>>> @@ -62,8 +62,7 @@ extern HVFState *hvf_state;
>>>    struct hvf_vcpu_state {
>>>        uint64_t fd;
>>>        void *exit;
>>> -    struct timespec ts;
>>> -    bool sleeping;
>>> +    sigset_t unblock_ipi_mask;
>>>    };
>>>
>>>    void assert_hvf_ok(hv_return_t ret);
>>> diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c
>>> index 8fe10966d2..60a361ff38 100644
>>> --- a/target/arm/hvf/hvf.c
>>> +++ b/target/arm/hvf/hvf.c
>>> @@ -2,6 +2,7 @@
>>>     * QEMU Hypervisor.framework support for Apple Silicon
>>>
>>>     * Copyright 2020 Alexander Graf <agraf@csgraf.de>
>>> + * Copyright 2020 Google LLC
>>>     *
>>>     * This work is licensed under the terms of the GNU GPL, version 2 or later.
>>>     * See the COPYING file in the top-level directory.
>>> @@ -18,6 +19,7 @@
>>>    #include "sysemu/hw_accel.h"
>>>
>>>    #include <Hypervisor/Hypervisor.h>
>>> +#include <mach/mach_time.h>
>>>
>>>    #include "exec/address-spaces.h"
>>>    #include "hw/irq.h"
>>> @@ -320,18 +322,8 @@ int hvf_arch_init_vcpu(CPUState *cpu)
>>>
>>>    void hvf_kick_vcpu_thread(CPUState *cpu)
>>>    {
>>> -    if (cpu->hvf->sleeping) {
>>> -        /*
>>> -         * When sleeping, make sure we always send signals. Also, clear the
>>> -         * timespec, so that an IPI that arrives between setting hvf->sleeping
>>> -         * and the nanosleep syscall still aborts the sleep.
>>> -         */
>>> -        cpu->thread_kicked = false;
>>> -        cpu->hvf->ts = (struct timespec){ };
>>> -        cpus_kick_thread(cpu);
>>> -    } else {
>>> -        hv_vcpus_exit(&cpu->hvf->fd, 1);
>>> -    }
>>> +    cpus_kick_thread(cpu);
>>> +    hv_vcpus_exit(&cpu->hvf->fd, 1);
>>
>> This means your first WFI will almost always return immediately due to a
>> pending signal, because there probably was an IRQ pending before on the
>> same CPU, no?
> That's right. Any approach involving the "sleeping" field would need
> to be implemented carefully to avoid races that may result in missed
> wakeups so for simplicity I just decided to send both kinds of
> wakeups. In particular the approach in the updated patch you sent is
> racy and I'll elaborate more in the reply to that patch.
>
>>>    }
>>>
>>>    static int hvf_inject_interrupts(CPUState *cpu)
>>> @@ -385,18 +377,19 @@ int hvf_vcpu_exec(CPUState *cpu)
>>>            uint64_t syndrome = hvf_exit->exception.syndrome;
>>>            uint32_t ec = syn_get_ec(syndrome);
>>>
>>> +        qemu_mutex_lock_iothread();
>>
>> Is there a particular reason you're moving the iothread lock out again
>> from the individual bits? I would really like to keep a notion of fast
>> path exits.
> We still need to lock at least once no matter the exit reason to check
> the interrupts so I don't think it's worth it to try and avoid locking
> like this. It also makes the implementation easier to reason about and
> therefore more likely to be correct. In our implementation we just
> stay locked the whole time unless we're in hv_vcpu_run() or pselect().
>
>>>            switch (exit_reason) {
>>>            case HV_EXIT_REASON_EXCEPTION:
>>>                /* This is the main one, handle below. */
>>>                break;
>>>            case HV_EXIT_REASON_VTIMER_ACTIVATED:
>>> -            qemu_mutex_lock_iothread();
>>>                current_cpu = cpu;
>>>                qemu_set_irq(arm_cpu->gt_timer_outputs[GTIMER_VIRT], 1);
>>>                qemu_mutex_unlock_iothread();
>>>                continue;
>>>            case HV_EXIT_REASON_CANCELED:
>>>                /* we got kicked, no exit to process */
>>> +            qemu_mutex_unlock_iothread();
>>>                continue;
>>>            default:
>>>                assert(0);
>>> @@ -413,7 +406,6 @@ int hvf_vcpu_exec(CPUState *cpu)
>>>                uint32_t srt = (syndrome >> 16) & 0x1f;
>>>                uint64_t val = 0;
>>>
>>> -            qemu_mutex_lock_iothread();
>>>                current_cpu = cpu;
>>>
>>>                DPRINTF("data abort: [pc=0x%llx va=0x%016llx pa=0x%016llx isv=%x "
>>> @@ -446,8 +438,6 @@ int hvf_vcpu_exec(CPUState *cpu)
>>>                    hvf_set_reg(cpu, srt, val);
>>>                }
>>>
>>> -            qemu_mutex_unlock_iothread();
>>> -
>>>                advance_pc = true;
>>>                break;
>>>            }
>>> @@ -493,68 +483,36 @@ int hvf_vcpu_exec(CPUState *cpu)
>>>            case EC_WFX_TRAP:
>>>                if (!(syndrome & WFX_IS_WFE) && !(cpu->interrupt_request &
>>>                    (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) {
>>> -                uint64_t cval, ctl, val, diff, now;
>>> +                uint64_t cval;
>>>
>>> -                /* Set up a local timer for vtimer if necessary ... */
>>> -                r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CTL_EL0, &ctl);
>>> -                assert_hvf_ok(r);
>>>                    r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CVAL_EL0, &cval);
>>>                    assert_hvf_ok(r);
>>>
>>> -                asm volatile("mrs %0, cntvct_el0" : "=r"(val));
>>> -                diff = cval - val;
>>> -
>>> -                now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) /
>>> -                      gt_cntfrq_period_ns(arm_cpu);
>>> -
>>> -                /* Timer disabled or masked, just wait for long */
>>> -                if (!(ctl & 1) || (ctl & 2)) {
>>> -                    diff = (120 * NANOSECONDS_PER_SECOND) /
>>> -                           gt_cntfrq_period_ns(arm_cpu);
>>> +                int64_t ticks_to_sleep = cval - mach_absolute_time();
>>> +                if (ticks_to_sleep < 0) {
>>> +                    break;
>>
>> This will loop at 100% for Windows, which configures the vtimer as
>> cval=0 ctl=7, so with IRQ mask bit set.
> Okay, but the 120s is kind of arbitrary so we should just sleep until
> we get a signal. That can be done by passing null as the timespec
> argument to pselect().


The reason I capped it at 120s was so that if I do hit a race, you don't 
break everything forever. Only for 2 minutes :).


>
>>
>> Alex
>>
>>
>>>                    }
>>>
>>> -                if (diff < INT64_MAX) {
>>> -                    uint64_t ns = diff * gt_cntfrq_period_ns(arm_cpu);
>>> -                    struct timespec *ts = &cpu->hvf->ts;
>>> -
>>> -                    *ts = (struct timespec){
>>> -                        .tv_sec = ns / NANOSECONDS_PER_SECOND,
>>> -                        .tv_nsec = ns % NANOSECONDS_PER_SECOND,
>>> -                    };
>>> -
>>> -                    /*
>>> -                     * Waking up easily takes 1ms, don't go to sleep for smaller
>>> -                     * time periods than 2ms.
>>> -                     */
>>> -                    if (!ts->tv_sec && (ts->tv_nsec < (SCALE_MS * 2))) {
>>
>> I put this logic here on purpose. A pselect(1 ns) easily takes 1-2ms to
>> return. Without logic like this, super short WFIs will hurt performance
>> quite badly.
> I don't think that's accurate. According to this benchmark it's a few
> hundred nanoseconds at most.
>
> pcc@pac-mini /tmp> cat pselect.c
> #include <signal.h>
> #include <sys/select.h>
>
> int main() {
>    sigset_t mask, orig_mask;
>    pthread_sigmask(SIG_SETMASK, 0, &mask);
>    sigaddset(&mask, SIGUSR1);
>    pthread_sigmask(SIG_SETMASK, &mask, &orig_mask);
>
>    for (int i = 0; i != 1000000; ++i) {
>      struct timespec ts = { 0, 1 };
>      pselect(0, 0, 0, 0, &ts, &orig_mask);
>    }
> }
> pcc@pac-mini /tmp> time ./pselect
>
> ________________________________________________________
> Executed in  179.87 millis    fish           external
>     usr time   77.68 millis   57.00 micros   77.62 millis
>     sys time  101.37 millis  852.00 micros  100.52 millis
>
> Besides, all that you're really saving here is the single pselect
> call. There are no doubt more expensive syscalls involved in exiting
> and entering the VCPU that would dominate here.


I would expect that such a super low ts value has a short-circuit path 
in the kernel as well. Where things start to fall apart is when you're 
at a threshold where rescheduling might be ok, but then you need to take 
all of the additional task switch overhead into account. Try to adapt 
your test code a bit:

#include <signal.h>
#include <sys/select.h>

int main() {
   sigset_t mask, orig_mask;
   pthread_sigmask(SIG_SETMASK, 0, &mask);
   sigaddset(&mask, SIGUSR1);
   pthread_sigmask(SIG_SETMASK, &mask, &orig_mask);

   for (int i = 0; i != 10000; ++i) {
#define SCALE_MS 1000000
     struct timespec ts = { 0, SCALE_MS / 10 };
     pselect(0, 0, 0, 0, &ts, &orig_mask);
   }
}


% time ./pselect
./pselect  0.00s user 0.01s system 1% cpu 1.282 total

You're suddenly seeing 300µs overhead per pselect call then. When I 
measured actual enter/exit times in QEMU, I saw much bigger differences 
between "time I want to sleep for" and "time I did sleep" even when just 
capturing the virtual time before and after the nanosleep/pselect call.
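
Something like the following makes that difference directly visible
(sketch; mach_timebase_info() converts mach_absolute_time() ticks to
nanoseconds):

#include <mach/mach_time.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/select.h>

int main(void) {
  mach_timebase_info_data_t tb;
  mach_timebase_info(&tb);

  sigset_t mask, orig_mask;
  pthread_sigmask(SIG_SETMASK, 0, &mask);
  sigaddset(&mask, SIGUSR1);
  pthread_sigmask(SIG_SETMASK, &mask, &orig_mask);

  uint64_t total_ns = 0;
  int iters = 10000;
  for (int i = 0; i != iters; ++i) {
    struct timespec ts = { 0, 100000 };  /* request 100us */
    uint64_t t0 = mach_absolute_time();
    pselect(0, 0, 0, 0, &ts, &orig_mask);
    uint64_t t1 = mach_absolute_time();
    total_ns += (t1 - t0) * tb.numer / tb.denom;
  }
  printf("requested 100000 ns, measured average %llu ns\n",
         (unsigned long long)(total_ns / iters));
}

The interesting number is how that average behaves as the requested sleep
shrinks towards the short WFI case.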


Alex



Re: [PATCH] arm/hvf: Optimize and simplify WFI handling
Posted by Peter Collingbourne 3 years, 5 months ago
On Tue, Dec 1, 2020 at 2:04 PM Alexander Graf <agraf@csgraf.de> wrote:
>
>
> On 01.12.20 19:59, Peter Collingbourne wrote:
> > On Tue, Dec 1, 2020 at 3:16 AM Alexander Graf <agraf@csgraf.de> wrote:
> >> Hi Peter,
> >>
> >> On 01.12.20 09:21, Peter Collingbourne wrote:
> >>> Sleep on WFx until the VTIMER is due but allow ourselves to be woken
> >>> up on IPI.
> >>>
> >>> Signed-off-by: Peter Collingbourne <pcc@google.com>
> >>
> >> Thanks a bunch!
> >>
> >>
> >>> ---
> >>> Alexander Graf wrote:
> >>>> I would love to take a patch from you here :). I'll still be stuck for a
> >>>> while with the sysreg sync rework that Peter asked for before I can look
> >>>> at WFI again.
> >>> Okay, here's a patch :) It's a relatively straightforward adaptation
> >>> of what we have in our fork, which can now boot Android to GUI while
> >>> remaining at around 4% CPU when idle.
> >>>
> >>> I'm not set up to boot a full Linux distribution at the moment so I
> >>> tested it on upstream QEMU by running a recent mainline Linux kernel
> >>> with a rootfs containing an init program that just does sleep(5)
> >>> and verified that the qemu process remains at low CPU usage during
> >>> the sleep. This was on top of your v2 plus the last patch of your v1
> >>> since it doesn't look like you have a replacement for that logic yet.
> >>>
> >>>    accel/hvf/hvf-cpus.c     |  5 +--
> >>>    include/sysemu/hvf_int.h |  3 +-
> >>>    target/arm/hvf/hvf.c     | 94 +++++++++++-----------------------------
> >>>    3 files changed, 28 insertions(+), 74 deletions(-)
> >>>
> >>> diff --git a/accel/hvf/hvf-cpus.c b/accel/hvf/hvf-cpus.c
> >>> index 4360f64671..b2c8fb57f6 100644
> >>> --- a/accel/hvf/hvf-cpus.c
> >>> +++ b/accel/hvf/hvf-cpus.c
> >>> @@ -344,9 +344,8 @@ static int hvf_init_vcpu(CPUState *cpu)
> >>>        sigact.sa_handler = dummy_signal;
> >>>        sigaction(SIG_IPI, &sigact, NULL);
> >>>
> >>> -    pthread_sigmask(SIG_BLOCK, NULL, &set);
> >>> -    sigdelset(&set, SIG_IPI);
> >>> -    pthread_sigmask(SIG_SETMASK, &set, NULL);
> >>> +    pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->unblock_ipi_mask);
> >>> +    sigdelset(&cpu->hvf->unblock_ipi_mask, SIG_IPI);
> >>
> >> What will this do to the x86 hvf implementation? We're now not
> >> unblocking SIG_IPI again for that, right?
> > Yes and that was the case before your patch series.
>
>
> The way I understand Roman, he wanted to unblock the IPI signal on x86:
>
> https://patchwork.kernel.org/project/qemu-devel/patch/20201126215017.41156-3-agraf@csgraf.de/#23807021
>
> I agree that at this point it's not a problem though to break it again.
> I'm not quite sure how to merge your patches within my patch set though,
> given they basically revert half of my previously introduced code...
>
>
> >
> >>>    #ifdef __aarch64__
> >>>        r = hv_vcpu_create(&cpu->hvf->fd, (hv_vcpu_exit_t **)&cpu->hvf->exit, NULL);
> >>> diff --git a/include/sysemu/hvf_int.h b/include/sysemu/hvf_int.h
> >>> index c56baa3ae8..13adf6ea77 100644
> >>> --- a/include/sysemu/hvf_int.h
> >>> +++ b/include/sysemu/hvf_int.h
> >>> @@ -62,8 +62,7 @@ extern HVFState *hvf_state;
> >>>    struct hvf_vcpu_state {
> >>>        uint64_t fd;
> >>>        void *exit;
> >>> -    struct timespec ts;
> >>> -    bool sleeping;
> >>> +    sigset_t unblock_ipi_mask;
> >>>    };
> >>>
> >>>    void assert_hvf_ok(hv_return_t ret);
> >>> diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c
> >>> index 8fe10966d2..60a361ff38 100644
> >>> --- a/target/arm/hvf/hvf.c
> >>> +++ b/target/arm/hvf/hvf.c
> >>> @@ -2,6 +2,7 @@
> >>>     * QEMU Hypervisor.framework support for Apple Silicon
> >>>
> >>>     * Copyright 2020 Alexander Graf <agraf@csgraf.de>
> >>> + * Copyright 2020 Google LLC
> >>>     *
> >>>     * This work is licensed under the terms of the GNU GPL, version 2 or later.
> >>>     * See the COPYING file in the top-level directory.
> >>> @@ -18,6 +19,7 @@
> >>>    #include "sysemu/hw_accel.h"
> >>>
> >>>    #include <Hypervisor/Hypervisor.h>
> >>> +#include <mach/mach_time.h>
> >>>
> >>>    #include "exec/address-spaces.h"
> >>>    #include "hw/irq.h"
> >>> @@ -320,18 +322,8 @@ int hvf_arch_init_vcpu(CPUState *cpu)
> >>>
> >>>    void hvf_kick_vcpu_thread(CPUState *cpu)
> >>>    {
> >>> -    if (cpu->hvf->sleeping) {
> >>> -        /*
> >>> -         * When sleeping, make sure we always send signals. Also, clear the
> >>> -         * timespec, so that an IPI that arrives between setting hvf->sleeping
> >>> -         * and the nanosleep syscall still aborts the sleep.
> >>> -         */
> >>> -        cpu->thread_kicked = false;
> >>> -        cpu->hvf->ts = (struct timespec){ };
> >>> -        cpus_kick_thread(cpu);
> >>> -    } else {
> >>> -        hv_vcpus_exit(&cpu->hvf->fd, 1);
> >>> -    }
> >>> +    cpus_kick_thread(cpu);
> >>> +    hv_vcpus_exit(&cpu->hvf->fd, 1);
> >>
> >> This means your first WFI will almost always return immediately due to a
> >> pending signal, because there probably was an IRQ pending before on the
> >> same CPU, no?
> > That's right. Any approach involving the "sleeping" field would need
> > to be implemented carefully to avoid races that may result in missed
> > wakeups so for simplicity I just decided to send both kinds of
> > wakeups. In particular the approach in the updated patch you sent is
> > racy and I'll elaborate more in the reply to that patch.
> >
> >>>    }
> >>>
> >>>    static int hvf_inject_interrupts(CPUState *cpu)
> >>> @@ -385,18 +377,19 @@ int hvf_vcpu_exec(CPUState *cpu)
> >>>            uint64_t syndrome = hvf_exit->exception.syndrome;
> >>>            uint32_t ec = syn_get_ec(syndrome);
> >>>
> >>> +        qemu_mutex_lock_iothread();
> >>
> >> Is there a particular reason you're moving the iothread lock out again
> >> from the individual bits? I would really like to keep a notion of fast
> >> path exits.
> > We still need to lock at least once no matter the exit reason to check
> > the interrupts so I don't think it's worth it to try and avoid locking
> > like this. It also makes the implementation easier to reason about and
> > therefore more likely to be correct. In our implementation we just
> > stay locked the whole time unless we're in hv_vcpu_run() or pselect().
> >
> >>>            switch (exit_reason) {
> >>>            case HV_EXIT_REASON_EXCEPTION:
> >>>                /* This is the main one, handle below. */
> >>>                break;
> >>>            case HV_EXIT_REASON_VTIMER_ACTIVATED:
> >>> -            qemu_mutex_lock_iothread();
> >>>                current_cpu = cpu;
> >>>                qemu_set_irq(arm_cpu->gt_timer_outputs[GTIMER_VIRT], 1);
> >>>                qemu_mutex_unlock_iothread();
> >>>                continue;
> >>>            case HV_EXIT_REASON_CANCELED:
> >>>                /* we got kicked, no exit to process */
> >>> +            qemu_mutex_unlock_iothread();
> >>>                continue;
> >>>            default:
> >>>                assert(0);
> >>> @@ -413,7 +406,6 @@ int hvf_vcpu_exec(CPUState *cpu)
> >>>                uint32_t srt = (syndrome >> 16) & 0x1f;
> >>>                uint64_t val = 0;
> >>>
> >>> -            qemu_mutex_lock_iothread();
> >>>                current_cpu = cpu;
> >>>
> >>>                DPRINTF("data abort: [pc=0x%llx va=0x%016llx pa=0x%016llx isv=%x "
> >>> @@ -446,8 +438,6 @@ int hvf_vcpu_exec(CPUState *cpu)
> >>>                    hvf_set_reg(cpu, srt, val);
> >>>                }
> >>>
> >>> -            qemu_mutex_unlock_iothread();
> >>> -
> >>>                advance_pc = true;
> >>>                break;
> >>>            }
> >>> @@ -493,68 +483,36 @@ int hvf_vcpu_exec(CPUState *cpu)
> >>>            case EC_WFX_TRAP:
> >>>                if (!(syndrome & WFX_IS_WFE) && !(cpu->interrupt_request &
> >>>                    (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) {
> >>> -                uint64_t cval, ctl, val, diff, now;
> >>> +                uint64_t cval;
> >>>
> >>> -                /* Set up a local timer for vtimer if necessary ... */
> >>> -                r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CTL_EL0, &ctl);
> >>> -                assert_hvf_ok(r);
> >>>                    r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CVAL_EL0, &cval);
> >>>                    assert_hvf_ok(r);
> >>>
> >>> -                asm volatile("mrs %0, cntvct_el0" : "=r"(val));
> >>> -                diff = cval - val;
> >>> -
> >>> -                now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) /
> >>> -                      gt_cntfrq_period_ns(arm_cpu);
> >>> -
> >>> -                /* Timer disabled or masked, just wait for long */
> >>> -                if (!(ctl & 1) || (ctl & 2)) {
> >>> -                    diff = (120 * NANOSECONDS_PER_SECOND) /
> >>> -                           gt_cntfrq_period_ns(arm_cpu);
> >>> +                int64_t ticks_to_sleep = cval - mach_absolute_time();
> >>> +                if (ticks_to_sleep < 0) {
> >>> +                    break;
> >>
> >> This will loop at 100% for Windows, which configures the vtimer as
> >> cval=0 ctl=7, so with IRQ mask bit set.
> > Okay, but the 120s is kind of arbitrary so we should just sleep until
> > we get a signal. That can be done by passing null as the timespec
> > argument to pselect().
>
>
> The reason I capped it at 120s was so that if I do hit a race, you don't
> break everything forever. Only for 2 minutes :).

I see. I think at this point we want to notice these types of bugs if
they exist instead of hiding them, so I would mildly be in favor of
not capping at 120s.

> >
> >>
> >> Alex
> >>
> >>
> >>>                    }
> >>>
> >>> -                if (diff < INT64_MAX) {
> >>> -                    uint64_t ns = diff * gt_cntfrq_period_ns(arm_cpu);
> >>> -                    struct timespec *ts = &cpu->hvf->ts;
> >>> -
> >>> -                    *ts = (struct timespec){
> >>> -                        .tv_sec = ns / NANOSECONDS_PER_SECOND,
> >>> -                        .tv_nsec = ns % NANOSECONDS_PER_SECOND,
> >>> -                    };
> >>> -
> >>> -                    /*
> >>> -                     * Waking up easily takes 1ms, don't go to sleep for smaller
> >>> -                     * time periods than 2ms.
> >>> -                     */
> >>> -                    if (!ts->tv_sec && (ts->tv_nsec < (SCALE_MS * 2))) {
> >>
> >> I put this logic here on purpose. A pselect(1 ns) easily takes 1-2ms to
> >> return. Without logic like this, super short WFIs will hurt performance
> >> quite badly.
> > I don't think that's accurate. According to this benchmark it's a few
> > hundred nanoseconds at most.
> >
> > pcc@pac-mini /tmp> cat pselect.c
> > #include <signal.h>
> > #include <sys/select.h>
> >
> > int main() {
> >    sigset_t mask, orig_mask;
> >    pthread_sigmask(SIG_SETMASK, 0, &mask);
> >    sigaddset(&mask, SIGUSR1);
> >    pthread_sigmask(SIG_SETMASK, &mask, &orig_mask);
> >
> >    for (int i = 0; i != 1000000; ++i) {
> >      struct timespec ts = { 0, 1 };
> >      pselect(0, 0, 0, 0, &ts, &orig_mask);
> >    }
> > }
> > pcc@pac-mini /tmp> time ./pselect
> >
> > ________________________________________________________
> > Executed in  179.87 millis    fish           external
> >     usr time   77.68 millis   57.00 micros   77.62 millis
> >     sys time  101.37 millis  852.00 micros  100.52 millis
> >
> > Besides, all that you're really saving here is the single pselect
> > call. There are no doubt more expensive syscalls involved in exiting
> > and entering the VCPU that would dominate here.
>
>
> I would expect that such a super low ts value has a short-circuit path
> in the kernel as well. Where things start to fall apart is when you're
> at a threshold where rescheduling might be ok, but then you need to take
> all of the additional task switch overhead into account. Try to adapt
> your test code a bit:
>
> #include <signal.h>
> #include <sys/select.h>
>
> int main() {
>    sigset_t mask, orig_mask;
>    pthread_sigmask(SIG_SETMASK, 0, &mask);
>    sigaddset(&mask, SIGUSR1);
>    pthread_sigmask(SIG_SETMASK, &mask, &orig_mask);
>
>    for (int i = 0; i != 10000; ++i) {
> #define SCALE_MS 1000000
>      struct timespec ts = { 0, SCALE_MS / 10 };
>      pselect(0, 0, 0, 0, &ts, &orig_mask);
>    }
> }
>
>
> % time ./pselect
> ./pselect  0.00s user 0.01s system 1% cpu 1.282 total
>
> You're suddenly seeing 300µs overhead per pselect call then. When I
> measured actual enter/exit times in QEMU, I saw much bigger differences
> between "time I want to sleep for" and "time I did sleep" even when just
> capturing the virtual time before and after the nanosleep/pselect call.

Okay. So the alternative is that we spin on the CPU, either doing
no-op VCPU entries/exits or something like:

while (mach_absolute_time() < cval);

My intuition is we shouldn't try to subvert the OS scheduler like this
unless it's proven to help with some real world metric since otherwise
we're not being fair to the other processes on the CPU. With CPU
intensive workloads I wouldn't expect these kinds of sleeps to happen
very often if at all so if it's only microbenchmarks and so on that
are affected then my inclination is not to do this for now.

Peter

Re: [PATCH] arm/hvf: Optimize and simplify WFI handling
Posted by Alexander Graf 3 years, 5 months ago
On 02.12.20 02:19, Peter Collingbourne wrote:
> On Tue, Dec 1, 2020 at 2:04 PM Alexander Graf <agraf@csgraf.de> wrote:
>>
>> On 01.12.20 19:59, Peter Collingbourne wrote:
>>> On Tue, Dec 1, 2020 at 3:16 AM Alexander Graf <agraf@csgraf.de> wrote:
>>>> Hi Peter,
>>>>
>>>> On 01.12.20 09:21, Peter Collingbourne wrote:
>>>>> Sleep on WFx until the VTIMER is due but allow ourselves to be woken
>>>>> up on IPI.
>>>>>
>>>>> Signed-off-by: Peter Collingbourne <pcc@google.com>
>>>> Thanks a bunch!
>>>>
>>>>
>>>>> ---
>>>>> Alexander Graf wrote:
>>>>>> I would love to take a patch from you here :). I'll still be stuck for a
>>>>>> while with the sysreg sync rework that Peter asked for before I can look
>>>>>> at WFI again.
>>>>> Okay, here's a patch :) It's a relatively straightforward adaptation
>>>>> of what we have in our fork, which can now boot Android to GUI while
>>>>> remaining at around 4% CPU when idle.
>>>>>
>>>>> I'm not set up to boot a full Linux distribution at the moment so I
>>>>> tested it on upstream QEMU by running a recent mainline Linux kernel
>>>>> with a rootfs containing an init program that just does sleep(5)
>>>>> and verified that the qemu process remains at low CPU usage during
>>>>> the sleep. This was on top of your v2 plus the last patch of your v1
>>>>> since it doesn't look like you have a replacement for that logic yet.
>>>>>
>>>>>     accel/hvf/hvf-cpus.c     |  5 +--
>>>>>     include/sysemu/hvf_int.h |  3 +-
>>>>>     target/arm/hvf/hvf.c     | 94 +++++++++++-----------------------------
>>>>>     3 files changed, 28 insertions(+), 74 deletions(-)
>>>>>
>>>>> diff --git a/accel/hvf/hvf-cpus.c b/accel/hvf/hvf-cpus.c
>>>>> index 4360f64671..b2c8fb57f6 100644
>>>>> --- a/accel/hvf/hvf-cpus.c
>>>>> +++ b/accel/hvf/hvf-cpus.c
>>>>> @@ -344,9 +344,8 @@ static int hvf_init_vcpu(CPUState *cpu)
>>>>>         sigact.sa_handler = dummy_signal;
>>>>>         sigaction(SIG_IPI, &sigact, NULL);
>>>>>
>>>>> -    pthread_sigmask(SIG_BLOCK, NULL, &set);
>>>>> -    sigdelset(&set, SIG_IPI);
>>>>> -    pthread_sigmask(SIG_SETMASK, &set, NULL);
>>>>> +    pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->unblock_ipi_mask);
>>>>> +    sigdelset(&cpu->hvf->unblock_ipi_mask, SIG_IPI);
>>>> What will this do to the x86 hvf implementation? We're now not
>>>> unblocking SIG_IPI again for that, right?
>>> Yes and that was the case before your patch series.
>>
>> The way I understand Roman, he wanted to unblock the IPI signal on x86:
>>
>> https://patchwork.kernel.org/project/qemu-devel/patch/20201126215017.41156-3-agraf@csgraf.de/#23807021
>>
>> I agree that at this point it's not a problem though to break it again.
>> I'm not quite sure how to merge your patches within my patch set though,
>> given they basically revert half of my previously introduced code...
>>
>>
>>>>>     #ifdef __aarch64__
>>>>>         r = hv_vcpu_create(&cpu->hvf->fd, (hv_vcpu_exit_t **)&cpu->hvf->exit, NULL);
>>>>> diff --git a/include/sysemu/hvf_int.h b/include/sysemu/hvf_int.h
>>>>> index c56baa3ae8..13adf6ea77 100644
>>>>> --- a/include/sysemu/hvf_int.h
>>>>> +++ b/include/sysemu/hvf_int.h
>>>>> @@ -62,8 +62,7 @@ extern HVFState *hvf_state;
>>>>>     struct hvf_vcpu_state {
>>>>>         uint64_t fd;
>>>>>         void *exit;
>>>>> -    struct timespec ts;
>>>>> -    bool sleeping;
>>>>> +    sigset_t unblock_ipi_mask;
>>>>>     };
>>>>>
>>>>>     void assert_hvf_ok(hv_return_t ret);
>>>>> diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c
>>>>> index 8fe10966d2..60a361ff38 100644
>>>>> --- a/target/arm/hvf/hvf.c
>>>>> +++ b/target/arm/hvf/hvf.c
>>>>> @@ -2,6 +2,7 @@
>>>>>      * QEMU Hypervisor.framework support for Apple Silicon
>>>>>
>>>>>      * Copyright 2020 Alexander Graf <agraf@csgraf.de>
>>>>> + * Copyright 2020 Google LLC
>>>>>      *
>>>>>      * This work is licensed under the terms of the GNU GPL, version 2 or later.
>>>>>      * See the COPYING file in the top-level directory.
>>>>> @@ -18,6 +19,7 @@
>>>>>     #include "sysemu/hw_accel.h"
>>>>>
>>>>>     #include <Hypervisor/Hypervisor.h>
>>>>> +#include <mach/mach_time.h>
>>>>>
>>>>>     #include "exec/address-spaces.h"
>>>>>     #include "hw/irq.h"
>>>>> @@ -320,18 +322,8 @@ int hvf_arch_init_vcpu(CPUState *cpu)
>>>>>
>>>>>     void hvf_kick_vcpu_thread(CPUState *cpu)
>>>>>     {
>>>>> -    if (cpu->hvf->sleeping) {
>>>>> -        /*
>>>>> -         * When sleeping, make sure we always send signals. Also, clear the
>>>>> -         * timespec, so that an IPI that arrives between setting hvf->sleeping
>>>>> -         * and the nanosleep syscall still aborts the sleep.
>>>>> -         */
>>>>> -        cpu->thread_kicked = false;
>>>>> -        cpu->hvf->ts = (struct timespec){ };
>>>>> -        cpus_kick_thread(cpu);
>>>>> -    } else {
>>>>> -        hv_vcpus_exit(&cpu->hvf->fd, 1);
>>>>> -    }
>>>>> +    cpus_kick_thread(cpu);
>>>>> +    hv_vcpus_exit(&cpu->hvf->fd, 1);
>>>> This means your first WFI will almost always return immediately due to a
>>>> pending signal, because there probably was an IRQ pending before on the
>>>> same CPU, no?
>>> That's right. Any approach involving the "sleeping" field would need
>>> to be implemented carefully to avoid races that may result in missed
>>> wakeups so for simplicity I just decided to send both kinds of
>>> wakeups. In particular the approach in the updated patch you sent is
>>> racy and I'll elaborate more in the reply to that patch.
>>>
>>>>>     }
>>>>>
>>>>>     static int hvf_inject_interrupts(CPUState *cpu)
>>>>> @@ -385,18 +377,19 @@ int hvf_vcpu_exec(CPUState *cpu)
>>>>>             uint64_t syndrome = hvf_exit->exception.syndrome;
>>>>>             uint32_t ec = syn_get_ec(syndrome);
>>>>>
>>>>> +        qemu_mutex_lock_iothread();
>>>> Is there a particular reason you're moving the iothread lock out again
>>>> from the individual bits? I would really like to keep a notion of fast
>>>> path exits.
>>> We still need to lock at least once no matter the exit reason to check
>>> the interrupts so I don't think it's worth it to try and avoid locking
>>> like this. It also makes the implementation easier to reason about and
>>> therefore more likely to be correct. In our implementation we just
>>> stay locked the whole time unless we're in hv_vcpu_run() or pselect().
>>>
>>>>>             switch (exit_reason) {
>>>>>             case HV_EXIT_REASON_EXCEPTION:
>>>>>                 /* This is the main one, handle below. */
>>>>>                 break;
>>>>>             case HV_EXIT_REASON_VTIMER_ACTIVATED:
>>>>> -            qemu_mutex_lock_iothread();
>>>>>                 current_cpu = cpu;
>>>>>                 qemu_set_irq(arm_cpu->gt_timer_outputs[GTIMER_VIRT], 1);
>>>>>                 qemu_mutex_unlock_iothread();
>>>>>                 continue;
>>>>>             case HV_EXIT_REASON_CANCELED:
>>>>>                 /* we got kicked, no exit to process */
>>>>> +            qemu_mutex_unlock_iothread();
>>>>>                 continue;
>>>>>             default:
>>>>>                 assert(0);
>>>>> @@ -413,7 +406,6 @@ int hvf_vcpu_exec(CPUState *cpu)
>>>>>                 uint32_t srt = (syndrome >> 16) & 0x1f;
>>>>>                 uint64_t val = 0;
>>>>>
>>>>> -            qemu_mutex_lock_iothread();
>>>>>                 current_cpu = cpu;
>>>>>
>>>>>                 DPRINTF("data abort: [pc=0x%llx va=0x%016llx pa=0x%016llx isv=%x "
>>>>> @@ -446,8 +438,6 @@ int hvf_vcpu_exec(CPUState *cpu)
>>>>>                     hvf_set_reg(cpu, srt, val);
>>>>>                 }
>>>>>
>>>>> -            qemu_mutex_unlock_iothread();
>>>>> -
>>>>>                 advance_pc = true;
>>>>>                 break;
>>>>>             }
>>>>> @@ -493,68 +483,36 @@ int hvf_vcpu_exec(CPUState *cpu)
>>>>>             case EC_WFX_TRAP:
>>>>>                 if (!(syndrome & WFX_IS_WFE) && !(cpu->interrupt_request &
>>>>>                     (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) {
>>>>> -                uint64_t cval, ctl, val, diff, now;
>>>>> +                uint64_t cval;
>>>>>
>>>>> -                /* Set up a local timer for vtimer if necessary ... */
>>>>> -                r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CTL_EL0, &ctl);
>>>>> -                assert_hvf_ok(r);
>>>>>                     r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CVAL_EL0, &cval);
>>>>>                     assert_hvf_ok(r);
>>>>>
>>>>> -                asm volatile("mrs %0, cntvct_el0" : "=r"(val));
>>>>> -                diff = cval - val;
>>>>> -
>>>>> -                now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) /
>>>>> -                      gt_cntfrq_period_ns(arm_cpu);
>>>>> -
>>>>> -                /* Timer disabled or masked, just wait for long */
>>>>> -                if (!(ctl & 1) || (ctl & 2)) {
>>>>> -                    diff = (120 * NANOSECONDS_PER_SECOND) /
>>>>> -                           gt_cntfrq_period_ns(arm_cpu);
>>>>> +                int64_t ticks_to_sleep = cval - mach_absolute_time();
>>>>> +                if (ticks_to_sleep < 0) {
>>>>> +                    break;
>>>> This will loop at 100% for Windows, which configures the vtimer as
>>>> cval=0 ctl=7, so with IRQ mask bit set.
>>> Okay, but the 120s is kind of arbitrary so we should just sleep until
>>> we get a signal. That can be done by passing null as the timespec
>>> argument to pselect().
>>
>> The reason I capped it at 120s was so that if I do hit a race, you don't
>> break everything forever. Only for 2 minutes :).
> I see. I think at this point we want to notice these types of bugs if
> they exist instead of hiding them, so I would mildly be in favor of
> not capping at 120s.


Crossing my fingers that we are at that point already :).


>
>>>> Alex
>>>>
>>>>
>>>>>                     }
>>>>>
>>>>> -                if (diff < INT64_MAX) {
>>>>> -                    uint64_t ns = diff * gt_cntfrq_period_ns(arm_cpu);
>>>>> -                    struct timespec *ts = &cpu->hvf->ts;
>>>>> -
>>>>> -                    *ts = (struct timespec){
>>>>> -                        .tv_sec = ns / NANOSECONDS_PER_SECOND,
>>>>> -                        .tv_nsec = ns % NANOSECONDS_PER_SECOND,
>>>>> -                    };
>>>>> -
>>>>> -                    /*
>>>>> -                     * Waking up easily takes 1ms, don't go to sleep for smaller
>>>>> -                     * time periods than 2ms.
>>>>> -                     */
>>>>> -                    if (!ts->tv_sec && (ts->tv_nsec < (SCALE_MS * 2))) {
>>>> I put this logic here on purpose. A pselect(1 ns) easily takes 1-2ms to
>>>> return. Without logic like this, super short WFIs will hurt performance
>>>> quite badly.
>>> I don't think that's accurate. According to this benchmark it's a few
>>> hundred nanoseconds at most.
>>>
>>> pcc@pac-mini /tmp> cat pselect.c
>>> #include <signal.h>
>>> #include <sys/select.h>
>>>
>>> int main() {
>>>     sigset_t mask, orig_mask;
>>>     pthread_sigmask(SIG_SETMASK, 0, &mask);
>>>     sigaddset(&mask, SIGUSR1);
>>>     pthread_sigmask(SIG_SETMASK, &mask, &orig_mask);
>>>
>>>     for (int i = 0; i != 1000000; ++i) {
>>>       struct timespec ts = { 0, 1 };
>>>       pselect(0, 0, 0, 0, &ts, &orig_mask);
>>>     }
>>> }
>>> pcc@pac-mini /tmp> time ./pselect
>>>
>>> ________________________________________________________
>>> Executed in  179.87 millis    fish           external
>>>      usr time   77.68 millis   57.00 micros   77.62 millis
>>>      sys time  101.37 millis  852.00 micros  100.52 millis
>>>
>>> Besides, all that you're really saving here is the single pselect
>>> call. There are no doubt more expensive syscalls involved in exiting
>>> and entering the VCPU that would dominate here.
>>
>> I would expect that such a super low ts value has a short-circuit path
>> in the kernel as well. Where things start to fall apart is when you're
>> at a threshold where rescheduling might be ok, but then you need to take
>> all of the additional task switch overhead into account. Try to adapt
>> your test code a bit:
>>
>> #include <signal.h>
>> #include <sys/select.h>
>>
>> int main() {
>>     sigset_t mask, orig_mask;
>>     pthread_sigmask(SIG_SETMASK, 0, &mask);
>>     sigaddset(&mask, SIGUSR1);
>>     pthread_sigmask(SIG_SETMASK, &mask, &orig_mask);
>>
>>     for (int i = 0; i != 10000; ++i) {
>> #define SCALE_MS 1000000
>>       struct timespec ts = { 0, SCALE_MS / 10 };
>>       pselect(0, 0, 0, 0, &ts, &orig_mask);
>>     }
>> }
>>
>>
>> % time ./pselect
>> ./pselect  0.00s user 0.01s system 1% cpu 1.282 total
>>
>> You're suddenly seeing 300µs overhead per pselect call then. When I
>> measured actual enter/exit times in QEMU, I saw much bigger differences
>> between "time I want to sleep for" and "time I did sleep" even when just
>> capturing the virtual time before and after the nanosleep/pselect call.
> Okay. So the alternative is that we spin on the CPU, either doing
> no-op VCPU entries/exits or something like:
>
> while (mach_absolute_time() < cval);


This won't catch events that arrive during that time, such as 
interrupts, right? I'd just declare the WFI as done and keep looping in 
and out of the guest for now.
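
Sketch of what I mean on top of your patch (untested):

                if (ticks_to_sleep < 0) {
                    /* vtimer already due, or masked with cval in the past:
                     * declare the WFI complete and go back into the guest */
                    advance_pc = true;
                    break;
                }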


> My intuition is we shouldn't try to subvert the OS scheduler like this
> unless it's proven to help with some real world metric since otherwise
> we're not being fair to the other processes on the CPU. With CPU
> intensive workloads I wouldn't expect these kinds of sleeps to happen
> very often if at all so if it's only microbenchmarks and so on that
> are affected then my inclination is not to do this for now.


The problem is that the VM's OS usually expects bare-metal timer behavior.
And that gives you much better granularity than what we can
achieve with a virtualization layer on top. So I do feel strongly about 
leaving this bit in. In the workloads you describe above, you won't ever 
hit that branch anyway.

The workloads that benefit from logic like this are message passing 
ones. Check out this presentation from a KVM colleague of yours for details:

https://www.linux-kvm.org/images/a/ac/02x03-Davit_Matalack-KVM_Message_passing_Performance.pdf
   https://www.youtube.com/watch?v=p85FFrloLFg


Alex


Re: [PATCH] arm/hvf: Optimize and simplify WFI handling
Posted by Peter Collingbourne 3 years, 5 months ago
On Tue, Dec 1, 2020 at 5:53 PM Alexander Graf <agraf@csgraf.de> wrote:
>
>
> On 02.12.20 02:19, Peter Collingbourne wrote:
> > On Tue, Dec 1, 2020 at 2:04 PM Alexander Graf <agraf@csgraf.de> wrote:
> >>
> >> On 01.12.20 19:59, Peter Collingbourne wrote:
> >>> On Tue, Dec 1, 2020 at 3:16 AM Alexander Graf <agraf@csgraf.de> wrote:
> >>>> Hi Peter,
> >>>>
> >>>> On 01.12.20 09:21, Peter Collingbourne wrote:
> >>>>> Sleep on WFx until the VTIMER is due but allow ourselves to be woken
> >>>>> up on IPI.
> >>>>>
> >>>>> Signed-off-by: Peter Collingbourne <pcc@google.com>
> >>>> Thanks a bunch!
> >>>>
> >>>>
> >>>>> ---
> >>>>> Alexander Graf wrote:
> >>>>>> I would love to take a patch from you here :). I'll still be stuck for a
> >>>>>> while with the sysreg sync rework that Peter asked for before I can look
> >>>>>> at WFI again.
> >>>>> Okay, here's a patch :) It's a relatively straightforward adaptation
> >>>>> of what we have in our fork, which can now boot Android to GUI while
> >>>>> remaining at around 4% CPU when idle.
> >>>>>
> >>>>> I'm not set up to boot a full Linux distribution at the moment so I
> >>>>> tested it on upstream QEMU by running a recent mainline Linux kernel
> >>>>> with a rootfs containing an init program that just does sleep(5)
> >>>>> and verified that the qemu process remains at low CPU usage during
> >>>>> the sleep. This was on top of your v2 plus the last patch of your v1
> >>>>> since it doesn't look like you have a replacement for that logic yet.
> >>>>>
> >>>>>     accel/hvf/hvf-cpus.c     |  5 +--
> >>>>>     include/sysemu/hvf_int.h |  3 +-
> >>>>>     target/arm/hvf/hvf.c     | 94 +++++++++++-----------------------------
> >>>>>     3 files changed, 28 insertions(+), 74 deletions(-)
> >>>>>
> >>>>> diff --git a/accel/hvf/hvf-cpus.c b/accel/hvf/hvf-cpus.c
> >>>>> index 4360f64671..b2c8fb57f6 100644
> >>>>> --- a/accel/hvf/hvf-cpus.c
> >>>>> +++ b/accel/hvf/hvf-cpus.c
> >>>>> @@ -344,9 +344,8 @@ static int hvf_init_vcpu(CPUState *cpu)
> >>>>>         sigact.sa_handler = dummy_signal;
> >>>>>         sigaction(SIG_IPI, &sigact, NULL);
> >>>>>
> >>>>> -    pthread_sigmask(SIG_BLOCK, NULL, &set);
> >>>>> -    sigdelset(&set, SIG_IPI);
> >>>>> -    pthread_sigmask(SIG_SETMASK, &set, NULL);
> >>>>> +    pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->unblock_ipi_mask);
> >>>>> +    sigdelset(&cpu->hvf->unblock_ipi_mask, SIG_IPI);
> >>>> What will this do to the x86 hvf implementation? We're now not
> >>>> unblocking SIG_IPI again for that, right?
> >>> Yes and that was the case before your patch series.
> >>
> >> The way I understand Roman, he wanted to unblock the IPI signal on x86:
> >>
> >> https://patchwork.kernel.org/project/qemu-devel/patch/20201126215017.41156-3-agraf@csgraf.de/#23807021
> >>
> >> I agree that at this point it's not a problem though to break it again.
> >> I'm not quite sure how to merge your patches within my patch set though,
> >> given they basically revert half of my previously introduced code...
> >>
> >>
> >>>>>     #ifdef __aarch64__
> >>>>>         r = hv_vcpu_create(&cpu->hvf->fd, (hv_vcpu_exit_t **)&cpu->hvf->exit, NULL);
> >>>>> diff --git a/include/sysemu/hvf_int.h b/include/sysemu/hvf_int.h
> >>>>> index c56baa3ae8..13adf6ea77 100644
> >>>>> --- a/include/sysemu/hvf_int.h
> >>>>> +++ b/include/sysemu/hvf_int.h
> >>>>> @@ -62,8 +62,7 @@ extern HVFState *hvf_state;
> >>>>>     struct hvf_vcpu_state {
> >>>>>         uint64_t fd;
> >>>>>         void *exit;
> >>>>> -    struct timespec ts;
> >>>>> -    bool sleeping;
> >>>>> +    sigset_t unblock_ipi_mask;
> >>>>>     };
> >>>>>
> >>>>>     void assert_hvf_ok(hv_return_t ret);
> >>>>> diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c
> >>>>> index 8fe10966d2..60a361ff38 100644
> >>>>> --- a/target/arm/hvf/hvf.c
> >>>>> +++ b/target/arm/hvf/hvf.c
> >>>>> @@ -2,6 +2,7 @@
> >>>>>      * QEMU Hypervisor.framework support for Apple Silicon
> >>>>>
> >>>>>      * Copyright 2020 Alexander Graf <agraf@csgraf.de>
> >>>>> + * Copyright 2020 Google LLC
> >>>>>      *
> >>>>>      * This work is licensed under the terms of the GNU GPL, version 2 or later.
> >>>>>      * See the COPYING file in the top-level directory.
> >>>>> @@ -18,6 +19,7 @@
> >>>>>     #include "sysemu/hw_accel.h"
> >>>>>
> >>>>>     #include <Hypervisor/Hypervisor.h>
> >>>>> +#include <mach/mach_time.h>
> >>>>>
> >>>>>     #include "exec/address-spaces.h"
> >>>>>     #include "hw/irq.h"
> >>>>> @@ -320,18 +322,8 @@ int hvf_arch_init_vcpu(CPUState *cpu)
> >>>>>
> >>>>>     void hvf_kick_vcpu_thread(CPUState *cpu)
> >>>>>     {
> >>>>> -    if (cpu->hvf->sleeping) {
> >>>>> -        /*
> >>>>> -         * When sleeping, make sure we always send signals. Also, clear the
> >>>>> -         * timespec, so that an IPI that arrives between setting hvf->sleeping
> >>>>> -         * and the nanosleep syscall still aborts the sleep.
> >>>>> -         */
> >>>>> -        cpu->thread_kicked = false;
> >>>>> -        cpu->hvf->ts = (struct timespec){ };
> >>>>> -        cpus_kick_thread(cpu);
> >>>>> -    } else {
> >>>>> -        hv_vcpus_exit(&cpu->hvf->fd, 1);
> >>>>> -    }
> >>>>> +    cpus_kick_thread(cpu);
> >>>>> +    hv_vcpus_exit(&cpu->hvf->fd, 1);
> >>>> This means your first WFI will almost always return immediately due to a
> >>>> pending signal, because there probably was an IRQ pending before on the
> >>>> same CPU, no?
> >>> That's right. Any approach involving the "sleeping" field would need
> >>> to be implemented carefully to avoid races that may result in missed
> >>> wakeups so for simplicity I just decided to send both kinds of
> >>> wakeups. In particular the approach in the updated patch you sent is
> >>> racy and I'll elaborate more in the reply to that patch.
> >>>
> >>>>>     }
> >>>>>
> >>>>>     static int hvf_inject_interrupts(CPUState *cpu)
> >>>>> @@ -385,18 +377,19 @@ int hvf_vcpu_exec(CPUState *cpu)
> >>>>>             uint64_t syndrome = hvf_exit->exception.syndrome;
> >>>>>             uint32_t ec = syn_get_ec(syndrome);
> >>>>>
> >>>>> +        qemu_mutex_lock_iothread();
> >>>> Is there a particular reason you're moving the iothread lock out again
> >>>> from the individual bits? I would really like to keep a notion of fast
> >>>> path exits.
> >>> We still need to lock at least once no matter the exit reason to check
> >>> the interrupts so I don't think it's worth it to try and avoid locking
> >>> like this. It also makes the implementation easier to reason about and
> >>> therefore more likely to be correct. In our implementation we just
> >>> stay locked the whole time unless we're in hv_vcpu_run() or pselect().
> >>>
> >>>>>             switch (exit_reason) {
> >>>>>             case HV_EXIT_REASON_EXCEPTION:
> >>>>>                 /* This is the main one, handle below. */
> >>>>>                 break;
> >>>>>             case HV_EXIT_REASON_VTIMER_ACTIVATED:
> >>>>> -            qemu_mutex_lock_iothread();
> >>>>>                 current_cpu = cpu;
> >>>>>                 qemu_set_irq(arm_cpu->gt_timer_outputs[GTIMER_VIRT], 1);
> >>>>>                 qemu_mutex_unlock_iothread();
> >>>>>                 continue;
> >>>>>             case HV_EXIT_REASON_CANCELED:
> >>>>>                 /* we got kicked, no exit to process */
> >>>>> +            qemu_mutex_unlock_iothread();
> >>>>>                 continue;
> >>>>>             default:
> >>>>>                 assert(0);
> >>>>> @@ -413,7 +406,6 @@ int hvf_vcpu_exec(CPUState *cpu)
> >>>>>                 uint32_t srt = (syndrome >> 16) & 0x1f;
> >>>>>                 uint64_t val = 0;
> >>>>>
> >>>>> -            qemu_mutex_lock_iothread();
> >>>>>                 current_cpu = cpu;
> >>>>>
> >>>>>                 DPRINTF("data abort: [pc=0x%llx va=0x%016llx pa=0x%016llx isv=%x "
> >>>>> @@ -446,8 +438,6 @@ int hvf_vcpu_exec(CPUState *cpu)
> >>>>>                     hvf_set_reg(cpu, srt, val);
> >>>>>                 }
> >>>>>
> >>>>> -            qemu_mutex_unlock_iothread();
> >>>>> -
> >>>>>                 advance_pc = true;
> >>>>>                 break;
> >>>>>             }
> >>>>> @@ -493,68 +483,36 @@ int hvf_vcpu_exec(CPUState *cpu)
> >>>>>             case EC_WFX_TRAP:
> >>>>>                 if (!(syndrome & WFX_IS_WFE) && !(cpu->interrupt_request &
> >>>>>                     (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) {
> >>>>> -                uint64_t cval, ctl, val, diff, now;
> >>>>> +                uint64_t cval;
> >>>>>
> >>>>> -                /* Set up a local timer for vtimer if necessary ... */
> >>>>> -                r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CTL_EL0, &ctl);
> >>>>> -                assert_hvf_ok(r);
> >>>>>                     r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CVAL_EL0, &cval);
> >>>>>                     assert_hvf_ok(r);
> >>>>>
> >>>>> -                asm volatile("mrs %0, cntvct_el0" : "=r"(val));
> >>>>> -                diff = cval - val;
> >>>>> -
> >>>>> -                now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) /
> >>>>> -                      gt_cntfrq_period_ns(arm_cpu);
> >>>>> -
> >>>>> -                /* Timer disabled or masked, just wait for long */
> >>>>> -                if (!(ctl & 1) || (ctl & 2)) {
> >>>>> -                    diff = (120 * NANOSECONDS_PER_SECOND) /
> >>>>> -                           gt_cntfrq_period_ns(arm_cpu);
> >>>>> +                int64_t ticks_to_sleep = cval - mach_absolute_time();
> >>>>> +                if (ticks_to_sleep < 0) {
> >>>>> +                    break;
> >>>> This will loop at 100% for Windows, which configures the vtimer as
> >>>> cval=0 ctl=7, so with IRQ mask bit set.
> >>> Okay, but the 120s is kind of arbitrary so we should just sleep until
> >>> we get a signal. That can be done by passing null as the timespec
> >>> argument to pselect().
> >>
> >> The reason I capped it at 120s was so that if I do hit a race, you don't
> >> break everything forever. Only for 2 minutes :).
> > I see. I think at this point we want to notice these types of bugs if
> > they exist instead of hiding them, so I would mildly be in favor of
> > not capping at 120s.
>
>
> Crossing my fingers that we are at that point already :).
>
>
> >
> >>>> Alex
> >>>>
> >>>>
> >>>>>                     }
> >>>>>
> >>>>> -                if (diff < INT64_MAX) {
> >>>>> -                    uint64_t ns = diff * gt_cntfrq_period_ns(arm_cpu);
> >>>>> -                    struct timespec *ts = &cpu->hvf->ts;
> >>>>> -
> >>>>> -                    *ts = (struct timespec){
> >>>>> -                        .tv_sec = ns / NANOSECONDS_PER_SECOND,
> >>>>> -                        .tv_nsec = ns % NANOSECONDS_PER_SECOND,
> >>>>> -                    };
> >>>>> -
> >>>>> -                    /*
> >>>>> -                     * Waking up easily takes 1ms, don't go to sleep for smaller
> >>>>> -                     * time periods than 2ms.
> >>>>> -                     */
> >>>>> -                    if (!ts->tv_sec && (ts->tv_nsec < (SCALE_MS * 2))) {
> >>>> I put this logic here on purpose. A pselect(1 ns) easily takes 1-2ms to
> >>>> return. Without logic like this, super short WFIs will hurt performance
> >>>> quite badly.
> >>> I don't think that's accurate. According to this benchmark it's a few
> >>> hundred nanoseconds at most.
> >>>
> >>> pcc@pac-mini /tmp> cat pselect.c
> >>> #include <signal.h>
> >>> #include <sys/select.h>
> >>>
> >>> int main() {
> >>>     sigset_t mask, orig_mask;
> >>>     pthread_sigmask(SIG_SETMASK, 0, &mask);
> >>>     sigaddset(&mask, SIGUSR1);
> >>>     pthread_sigmask(SIG_SETMASK, &mask, &orig_mask);
> >>>
> >>>     for (int i = 0; i != 1000000; ++i) {
> >>>       struct timespec ts = { 0, 1 };
> >>>       pselect(0, 0, 0, 0, &ts, &orig_mask);
> >>>     }
> >>> }
> >>> pcc@pac-mini /tmp> time ./pselect
> >>>
> >>> ________________________________________________________
> >>> Executed in  179.87 millis    fish           external
> >>>      usr time   77.68 millis   57.00 micros   77.62 millis
> >>>      sys time  101.37 millis  852.00 micros  100.52 millis
> >>>
> >>> Besides, all that you're really saving here is the single pselect
> >>> call. There are no doubt more expensive syscalls involved in exiting
> >>> and entering the VCPU that would dominate here.
> >>
> >> I would expect that such a super low ts value has a short-circuit path
> >> in the kernel as well. Where things start to fall apart is when you're
> >> at a threshold where rescheduling might be ok, but then you need to take
> >> all of the additional task switch overhead into account. Try to adapt
> >> your test code a bit:
> >>
> >> #include <signal.h>
> >> #include <sys/select.h>
> >>
> >> int main() {
> >>     sigset_t mask, orig_mask;
> >>     pthread_sigmask(SIG_SETMASK, 0, &mask);
> >>     sigaddset(&mask, SIGUSR1);
> >>     pthread_sigmask(SIG_SETMASK, &mask, &orig_mask);
> >>
> >>     for (int i = 0; i != 10000; ++i) {
> >> #define SCALE_MS 1000000
> >>       struct timespec ts = { 0, SCALE_MS / 10 };
> >>       pselect(0, 0, 0, 0, &ts, &orig_mask);
> >>     }
> >> }
> >>
> >>
> >> % time ./pselect
> >> ./pselect  0.00s user 0.01s system 1% cpu 1.282 total
> >>
> >> You're suddenly seeing 300µs overhead per pselect call then. When I
> >> measured actual enter/exit times in QEMU, I saw much bigger differences
> >> between "time I want to sleep for" and "time I did sleep" even when just
> >> capturing the virtual time before and after the nanosleep/pselect call.
> > Okay. So the alternative is that we spin on the CPU, either doing
> > no-op VCPU entries/exits or something like:
> >
> > while (mach_absolute_time() < cval);
>
>
> This won't catch events that arrive during that time, such as
> interrupts, right? I'd just declare the WFI as done and keep looping in
> and out of the guest for now.

Oh, that's a good point.

> > My intuition is we shouldn't try to subvert the OS scheduler like this
> > unless it's proven to help with some real world metric since otherwise
> > we're not being fair to the other processes on the CPU. With CPU
> > intensive workloads I wouldn't expect these kinds of sleeps to happen
> > very often if at all so if it's only microbenchmarks and so on that
> > are affected then my inclination is not to do this for now.
>
>
> The problem is that the VM's OS usually expects bare-metal timer behavior.
> And that gives you much better granularity than what we can
> achieve with a virtualization layer on top. So I do feel strongly about
> leaving this bit in. In the workloads you describe above, you won't ever
> hit that branch anyway.
>
> The workloads that benefit from logic like this are message passing
> ones. Check out this presentation from a KVM colleague of yours for details:
>
> https://www.linux-kvm.org/images/a/ac/02x03-Davit_Matalack-KVM_Message_passing_Performance.pdf
>    https://www.youtube.com/watch?v=p85FFrloLFg

Mm, okay. I personally would not add anything like that at this point
without real-world data but I don't feel too strongly and I suppose
the implementation can always be adjusted later.
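
If it does turn out to matter, I would imagine the adjustment on top of the
new code looking roughly like this (untested sketch, reusing the
seconds/nanos computed for the timespec and the 2ms threshold from the
earlier version):

                if (seconds == 0 && nanos < 2 * SCALE_MS) {
                    /* Too short to be worth a pselect() round trip: declare
                     * the WFI complete and re-enter the guest instead */
                    advance_pc = true;
                    break;
                }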

Peter

Re: [PATCH] arm/hvf: Optimize and simplify WFI handling
Posted by Roman Bolshakov 3 years, 5 months ago
On Tue, Dec 01, 2020 at 10:59:50AM -0800, Peter Collingbourne wrote:
> On Tue, Dec 1, 2020 at 3:16 AM Alexander Graf <agraf@csgraf.de> wrote:
> >
> > Hi Peter,
> >
> > On 01.12.20 09:21, Peter Collingbourne wrote:
> > > Sleep on WFx until the VTIMER is due but allow ourselves to be woken
> > > up on IPI.
> > >
> > > Signed-off-by: Peter Collingbourne <pcc@google.com>
> >
> >
> > Thanks a bunch!
> >
> >
> > > ---
> > > Alexander Graf wrote:
> > >> I would love to take a patch from you here :). I'll still be stuck for a
> > >> while with the sysreg sync rework that Peter asked for before I can look
> > >> at WFI again.
> > > Okay, here's a patch :) It's a relatively straightforward adaptation
> > > of what we have in our fork, which can now boot Android to GUI while
> > > remaining at around 4% CPU when idle.
> > >
> > > I'm not set up to boot a full Linux distribution at the moment so I
> > > tested it on upstream QEMU by running a recent mainline Linux kernel
> > > with a rootfs containing an init program that just does sleep(5)
> > > and verified that the qemu process remains at low CPU usage during
> > > the sleep. This was on top of your v2 plus the last patch of your v1
> > > since it doesn't look like you have a replacement for that logic yet.
> > >
> > >   accel/hvf/hvf-cpus.c     |  5 +--
> > >   include/sysemu/hvf_int.h |  3 +-
> > >   target/arm/hvf/hvf.c     | 94 +++++++++++-----------------------------
> > >   3 files changed, 28 insertions(+), 74 deletions(-)
> > >
> > > diff --git a/accel/hvf/hvf-cpus.c b/accel/hvf/hvf-cpus.c
> > > index 4360f64671..b2c8fb57f6 100644
> > > --- a/accel/hvf/hvf-cpus.c
> > > +++ b/accel/hvf/hvf-cpus.c
> > > @@ -344,9 +344,8 @@ static int hvf_init_vcpu(CPUState *cpu)
> > >       sigact.sa_handler = dummy_signal;
> > >       sigaction(SIG_IPI, &sigact, NULL);
> > >
> > > -    pthread_sigmask(SIG_BLOCK, NULL, &set);
> > > -    sigdelset(&set, SIG_IPI);
> > > -    pthread_sigmask(SIG_SETMASK, &set, NULL);
> > > +    pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->unblock_ipi_mask);
> > > +    sigdelset(&cpu->hvf->unblock_ipi_mask, SIG_IPI);
> >
> >
> > What will this do to the x86 hvf implementation? We're now not
> > unblocking SIG_IPI again for that, right?
> 
> Yes and that was the case before your patch series.
> 
> > >
> > >   #ifdef __aarch64__
> > >       r = hv_vcpu_create(&cpu->hvf->fd, (hv_vcpu_exit_t **)&cpu->hvf->exit, NULL);
> > > diff --git a/include/sysemu/hvf_int.h b/include/sysemu/hvf_int.h
> > > index c56baa3ae8..13adf6ea77 100644
> > > --- a/include/sysemu/hvf_int.h
> > > +++ b/include/sysemu/hvf_int.h
> > > @@ -62,8 +62,7 @@ extern HVFState *hvf_state;
> > >   struct hvf_vcpu_state {
> > >       uint64_t fd;
> > >       void *exit;
> > > -    struct timespec ts;
> > > -    bool sleeping;
> > > +    sigset_t unblock_ipi_mask;
> > >   };
> > >
> > >   void assert_hvf_ok(hv_return_t ret);
> > > diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c
> > > index 8fe10966d2..60a361ff38 100644
> > > --- a/target/arm/hvf/hvf.c
> > > +++ b/target/arm/hvf/hvf.c
> > > @@ -2,6 +2,7 @@
> > >    * QEMU Hypervisor.framework support for Apple Silicon
> > >
> > >    * Copyright 2020 Alexander Graf <agraf@csgraf.de>
> > > + * Copyright 2020 Google LLC
> > >    *
> > >    * This work is licensed under the terms of the GNU GPL, version 2 or later.
> > >    * See the COPYING file in the top-level directory.
> > > @@ -18,6 +19,7 @@
> > >   #include "sysemu/hw_accel.h"
> > >
> > >   #include <Hypervisor/Hypervisor.h>
> > > +#include <mach/mach_time.h>
> > >
> > >   #include "exec/address-spaces.h"
> > >   #include "hw/irq.h"
> > > @@ -320,18 +322,8 @@ int hvf_arch_init_vcpu(CPUState *cpu)
> > >
> > >   void hvf_kick_vcpu_thread(CPUState *cpu)
> > >   {
> > > -    if (cpu->hvf->sleeping) {
> > > -        /*
> > > -         * When sleeping, make sure we always send signals. Also, clear the
> > > -         * timespec, so that an IPI that arrives between setting hvf->sleeping
> > > -         * and the nanosleep syscall still aborts the sleep.
> > > -         */
> > > -        cpu->thread_kicked = false;
> > > -        cpu->hvf->ts = (struct timespec){ };
> > > -        cpus_kick_thread(cpu);
> > > -    } else {
> > > -        hv_vcpus_exit(&cpu->hvf->fd, 1);
> > > -    }
> > > +    cpus_kick_thread(cpu);
> > > +    hv_vcpus_exit(&cpu->hvf->fd, 1);
> >
> >
> > This means your first WFI will almost always return immediately due to a
> > pending signal, because there probably was an IRQ pending before on the
> > same CPU, no?
> 
> That's right. Any approach involving the "sleeping" field would need
> to be implemented carefully to avoid races that may result in missed
> wakeups so for simplicity I just decided to send both kinds of
> wakeups. In particular the approach in the updated patch you sent is
> racy and I'll elaborate more in the reply to that patch.
> 
> > >   }
> > >
> > >   static int hvf_inject_interrupts(CPUState *cpu)
> > > @@ -385,18 +377,19 @@ int hvf_vcpu_exec(CPUState *cpu)
> > >           uint64_t syndrome = hvf_exit->exception.syndrome;
> > >           uint32_t ec = syn_get_ec(syndrome);
> > >
> > > +        qemu_mutex_lock_iothread();
> >
> >
> > Is there a particular reason you're moving the iothread lock out again
> > from the individual bits? I would really like to keep a notion of fast
> > path exits.
> 
> We still need to lock at least once no matter the exit reason to check
> the interrupts so I don't think it's worth it to try and avoid locking
> like this. It also makes the implementation easier to reason about and
> therefore more likely to be correct. In our implementation we just
> stay locked the whole time unless we're in hv_vcpu_run() or pselect().
> 

But does it leave a small window for a kick to be lost between
qemu_mutex_unlock_iothread() and hv_vcpu_run()/pselect()?

For x86 it could lose a kick between them. That was a reason for the
sophisticated approach to catch the kick [1] (and related discussions in
v1/v2/v3).  Unfortunately I can't read ARM assembly yet so I don't know if
hv_vcpus_exit() suffers from the same issue as x86 hv_vcpu_interrupt().

1. https://patchwork.kernel.org/project/qemu-devel/patch/20200729124832.79375-1-r.bolshakov@yadro.com/

Thanks,
Roman

> > >           switch (exit_reason) {
> > >           case HV_EXIT_REASON_EXCEPTION:
> > >               /* This is the main one, handle below. */
> > >               break;
> > >           case HV_EXIT_REASON_VTIMER_ACTIVATED:
> > > -            qemu_mutex_lock_iothread();
> > >               current_cpu = cpu;
> > >               qemu_set_irq(arm_cpu->gt_timer_outputs[GTIMER_VIRT], 1);
> > >               qemu_mutex_unlock_iothread();
> > >               continue;
> > >           case HV_EXIT_REASON_CANCELED:
> > >               /* we got kicked, no exit to process */
> > > +            qemu_mutex_unlock_iothread();
> > >               continue;
> > >           default:
> > >               assert(0);
> > > @@ -413,7 +406,6 @@ int hvf_vcpu_exec(CPUState *cpu)
> > >               uint32_t srt = (syndrome >> 16) & 0x1f;
> > >               uint64_t val = 0;
> > >
> > > -            qemu_mutex_lock_iothread();
> > >               current_cpu = cpu;
> > >
> > >               DPRINTF("data abort: [pc=0x%llx va=0x%016llx pa=0x%016llx isv=%x "
> > > @@ -446,8 +438,6 @@ int hvf_vcpu_exec(CPUState *cpu)
> > >                   hvf_set_reg(cpu, srt, val);
> > >               }
> > >
> > > -            qemu_mutex_unlock_iothread();
> > > -
> > >               advance_pc = true;
> > >               break;
> > >           }
> > > @@ -493,68 +483,36 @@ int hvf_vcpu_exec(CPUState *cpu)
> > >           case EC_WFX_TRAP:
> > >               if (!(syndrome & WFX_IS_WFE) && !(cpu->interrupt_request &
> > >                   (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) {
> > > -                uint64_t cval, ctl, val, diff, now;
> > > +                uint64_t cval;
> > >
> > > -                /* Set up a local timer for vtimer if necessary ... */
> > > -                r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CTL_EL0, &ctl);
> > > -                assert_hvf_ok(r);
> > >                   r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CVAL_EL0, &cval);
> > >                   assert_hvf_ok(r);
> > >
> > > -                asm volatile("mrs %0, cntvct_el0" : "=r"(val));
> > > -                diff = cval - val;
> > > -
> > > -                now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) /
> > > -                      gt_cntfrq_period_ns(arm_cpu);
> > > -
> > > -                /* Timer disabled or masked, just wait for long */
> > > -                if (!(ctl & 1) || (ctl & 2)) {
> > > -                    diff = (120 * NANOSECONDS_PER_SECOND) /
> > > -                           gt_cntfrq_period_ns(arm_cpu);
> > > +                int64_t ticks_to_sleep = cval - mach_absolute_time();
> > > +                if (ticks_to_sleep < 0) {
> > > +                    break;
> >
> >
> > This will loop at 100% for Windows, which configures the vtimer as
> > cval=0 ctl=7, so with IRQ mask bit set.
> 
> Okay, but the 120s is kind of arbitrary so we should just sleep until
> we get a signal. That can be done by passing null as the timespec
> argument to pselect().
> 
> >
> >
> > Alex
> >
> >
> > >                   }
> > >
> > > -                if (diff < INT64_MAX) {
> > > -                    uint64_t ns = diff * gt_cntfrq_period_ns(arm_cpu);
> > > -                    struct timespec *ts = &cpu->hvf->ts;
> > > -
> > > -                    *ts = (struct timespec){
> > > -                        .tv_sec = ns / NANOSECONDS_PER_SECOND,
> > > -                        .tv_nsec = ns % NANOSECONDS_PER_SECOND,
> > > -                    };
> > > -
> > > -                    /*
> > > -                     * Waking up easily takes 1ms, don't go to sleep for smaller
> > > -                     * time periods than 2ms.
> > > -                     */
> > > -                    if (!ts->tv_sec && (ts->tv_nsec < (SCALE_MS * 2))) {
> >
> >
> > I put this logic here on purpose. A pselect(1 ns) easily takes 1-2ms to
> > return. Without logic like this, super short WFIs will hurt performance
> > quite badly.
> 
> I don't think that's accurate. According to this benchmark it's a few
> hundred nanoseconds at most.
> 
> pcc@pac-mini /tmp> cat pselect.c
> #include <signal.h>
> #include <sys/select.h>
> 
> int main() {
>   sigset_t mask, orig_mask;
>   pthread_sigmask(SIG_SETMASK, 0, &mask);
>   sigaddset(&mask, SIGUSR1);
>   pthread_sigmask(SIG_SETMASK, &mask, &orig_mask);
> 
>   for (int i = 0; i != 1000000; ++i) {
>     struct timespec ts = { 0, 1 };
>     pselect(0, 0, 0, 0, &ts, &orig_mask);
>   }
> }
> pcc@pac-mini /tmp> time ./pselect
> 
> ________________________________________________________
> Executed in  179.87 millis    fish           external
>    usr time   77.68 millis   57.00 micros   77.62 millis
>    sys time  101.37 millis  852.00 micros  100.52 millis
> 
> Besides, all that you're really saving here is the single pselect
> call. There are no doubt more expensive syscalls involved in exiting
> and entering the VCPU that would dominate here.
> 
> Peter
> 
> >
> >
> > Alex
> >
> > > -                        advance_pc = true;
> > > -                        break;
> > > -                    }
> > > -
> > > -                    /* Set cpu->hvf->sleeping so that we get a SIG_IPI signal. */
> > > -                    cpu->hvf->sleeping = true;
> > > -                    smp_mb();
> > > -
> > > -                    /* Bail out if we received an IRQ meanwhile */
> > > -                    if (cpu->thread_kicked || (cpu->interrupt_request &
> > > -                        (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) {
> > > -                        cpu->hvf->sleeping = false;
> > > -                        break;
> > > -                    }
> > > -
> > > -                    /* nanosleep returns on signal, so we wake up on kick. */
> > > -                    nanosleep(ts, NULL);
> > > -
> > > -                    /* Out of sleep - either naturally or because of a kick */
> > > -                    cpu->hvf->sleeping = false;
> > > -                }
> > > +                uint64_t seconds = ticks_to_sleep / arm_cpu->gt_cntfrq_hz;
> > > +                uint64_t nanos =
> > > +                    (ticks_to_sleep - arm_cpu->gt_cntfrq_hz * seconds) *
> > > +                    1000000000 / arm_cpu->gt_cntfrq_hz;
> > > +                struct timespec ts = { seconds, nanos };
> > > +
> > > +                /*
> > > +                 * Use pselect to sleep so that other threads can IPI us while
> > > +                 * we're sleeping.
> > > +                 */
> > > +                qatomic_mb_set(&cpu->thread_kicked, false);
> > > +                qemu_mutex_unlock_iothread();
> > > +                pselect(0, 0, 0, 0, &ts, &cpu->hvf->unblock_ipi_mask);
> > > +                qemu_mutex_lock_iothread();
> > >
> > >                   advance_pc = true;
> > >               }
> > >               break;
> > >           case EC_AA64_HVC:
> > >               cpu_synchronize_state(cpu);
> > > -            qemu_mutex_lock_iothread();
> > >               current_cpu = cpu;
> > >               if (arm_is_psci_call(arm_cpu, EXCP_HVC)) {
> > >                   arm_handle_psci_call(arm_cpu);
> > > @@ -562,11 +520,9 @@ int hvf_vcpu_exec(CPUState *cpu)
> > >                   DPRINTF("unknown HVC! %016llx", env->xregs[0]);
> > >                   env->xregs[0] = -1;
> > >               }
> > > -            qemu_mutex_unlock_iothread();
> > >               break;
> > >           case EC_AA64_SMC:
> > >               cpu_synchronize_state(cpu);
> > > -            qemu_mutex_lock_iothread();
> > >               current_cpu = cpu;
> > >               if (arm_is_psci_call(arm_cpu, EXCP_SMC)) {
> > >                   arm_handle_psci_call(arm_cpu);
> > > @@ -575,7 +531,6 @@ int hvf_vcpu_exec(CPUState *cpu)
> > >                   env->xregs[0] = -1;
> > >                   env->pc += 4;
> > >               }
> > > -            qemu_mutex_unlock_iothread();
> > >               break;
> > >           default:
> > >               cpu_synchronize_state(cpu);
> > > @@ -594,6 +549,7 @@ int hvf_vcpu_exec(CPUState *cpu)
> > >               r = hv_vcpu_set_reg(cpu->hvf->fd, HV_REG_PC, pc);
> > >               assert_hvf_ok(r);
> > >           }
> > > +        qemu_mutex_unlock_iothread();
> > >       } while (ret == 0);
> > >
> > >       qemu_mutex_lock_iothread();

Re: [PATCH] arm/hvf: Optimize and simplify WFI handling
Posted by Peter Collingbourne 3 years, 5 months ago
On Thu, Dec 3, 2020 at 2:12 AM Roman Bolshakov <r.bolshakov@yadro.com> wrote:
>
> On Tue, Dec 01, 2020 at 10:59:50AM -0800, Peter Collingbourne wrote:
> > On Tue, Dec 1, 2020 at 3:16 AM Alexander Graf <agraf@csgraf.de> wrote:
> > >
> > > Hi Peter,
> > >
> > > On 01.12.20 09:21, Peter Collingbourne wrote:
> > > > Sleep on WFx until the VTIMER is due but allow ourselves to be woken
> > > > up on IPI.
> > > >
> > > > Signed-off-by: Peter Collingbourne <pcc@google.com>
> > >
> > >
> > > Thanks a bunch!
> > >
> > >
> > > > ---
> > > > Alexander Graf wrote:
> > > >> I would love to take a patch from you here :). I'll still be stuck for a
> > > >> while with the sysreg sync rework that Peter asked for before I can look
> > > >> at WFI again.
> > > > Okay, here's a patch :) It's a relatively straightforward adaptation
> > > > of what we have in our fork, which can now boot Android to GUI while
> > > > remaining at around 4% CPU when idle.
> > > >
> > > > I'm not set up to boot a full Linux distribution at the moment so I
> > > > tested it on upstream QEMU by running a recent mainline Linux kernel
> > > > with a rootfs containing an init program that just does sleep(5)
> > > > and verified that the qemu process remains at low CPU usage during
> > > > the sleep. This was on top of your v2 plus the last patch of your v1
> > > > since it doesn't look like you have a replacement for that logic yet.
> > > >
> > > >   accel/hvf/hvf-cpus.c     |  5 +--
> > > >   include/sysemu/hvf_int.h |  3 +-
> > > >   target/arm/hvf/hvf.c     | 94 +++++++++++-----------------------------
> > > >   3 files changed, 28 insertions(+), 74 deletions(-)
> > > >
> > > > diff --git a/accel/hvf/hvf-cpus.c b/accel/hvf/hvf-cpus.c
> > > > index 4360f64671..b2c8fb57f6 100644
> > > > --- a/accel/hvf/hvf-cpus.c
> > > > +++ b/accel/hvf/hvf-cpus.c
> > > > @@ -344,9 +344,8 @@ static int hvf_init_vcpu(CPUState *cpu)
> > > >       sigact.sa_handler = dummy_signal;
> > > >       sigaction(SIG_IPI, &sigact, NULL);
> > > >
> > > > -    pthread_sigmask(SIG_BLOCK, NULL, &set);
> > > > -    sigdelset(&set, SIG_IPI);
> > > > -    pthread_sigmask(SIG_SETMASK, &set, NULL);
> > > > +    pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->unblock_ipi_mask);
> > > > +    sigdelset(&cpu->hvf->unblock_ipi_mask, SIG_IPI);
> > >
> > >
> > > What will this do to the x86 hvf implementation? We're now not
> > > unblocking SIG_IPI again for that, right?
> >
> > Yes and that was the case before your patch series.
> >
> > > >
> > > >   #ifdef __aarch64__
> > > >       r = hv_vcpu_create(&cpu->hvf->fd, (hv_vcpu_exit_t **)&cpu->hvf->exit, NULL);
> > > > diff --git a/include/sysemu/hvf_int.h b/include/sysemu/hvf_int.h
> > > > index c56baa3ae8..13adf6ea77 100644
> > > > --- a/include/sysemu/hvf_int.h
> > > > +++ b/include/sysemu/hvf_int.h
> > > > @@ -62,8 +62,7 @@ extern HVFState *hvf_state;
> > > >   struct hvf_vcpu_state {
> > > >       uint64_t fd;
> > > >       void *exit;
> > > > -    struct timespec ts;
> > > > -    bool sleeping;
> > > > +    sigset_t unblock_ipi_mask;
> > > >   };
> > > >
> > > >   void assert_hvf_ok(hv_return_t ret);
> > > > diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c
> > > > index 8fe10966d2..60a361ff38 100644
> > > > --- a/target/arm/hvf/hvf.c
> > > > +++ b/target/arm/hvf/hvf.c
> > > > @@ -2,6 +2,7 @@
> > > >    * QEMU Hypervisor.framework support for Apple Silicon
> > > >
> > > >    * Copyright 2020 Alexander Graf <agraf@csgraf.de>
> > > > + * Copyright 2020 Google LLC
> > > >    *
> > > >    * This work is licensed under the terms of the GNU GPL, version 2 or later.
> > > >    * See the COPYING file in the top-level directory.
> > > > @@ -18,6 +19,7 @@
> > > >   #include "sysemu/hw_accel.h"
> > > >
> > > >   #include <Hypervisor/Hypervisor.h>
> > > > +#include <mach/mach_time.h>
> > > >
> > > >   #include "exec/address-spaces.h"
> > > >   #include "hw/irq.h"
> > > > @@ -320,18 +322,8 @@ int hvf_arch_init_vcpu(CPUState *cpu)
> > > >
> > > >   void hvf_kick_vcpu_thread(CPUState *cpu)
> > > >   {
> > > > -    if (cpu->hvf->sleeping) {
> > > > -        /*
> > > > -         * When sleeping, make sure we always send signals. Also, clear the
> > > > -         * timespec, so that an IPI that arrives between setting hvf->sleeping
> > > > -         * and the nanosleep syscall still aborts the sleep.
> > > > -         */
> > > > -        cpu->thread_kicked = false;
> > > > -        cpu->hvf->ts = (struct timespec){ };
> > > > -        cpus_kick_thread(cpu);
> > > > -    } else {
> > > > -        hv_vcpus_exit(&cpu->hvf->fd, 1);
> > > > -    }
> > > > +    cpus_kick_thread(cpu);
> > > > +    hv_vcpus_exit(&cpu->hvf->fd, 1);
> > >
> > >
> > > This means your first WFI will almost always return immediately due to a
> > > pending signal, because there probably was an IRQ pending before on the
> > > same CPU, no?
> >
> > That's right. Any approach involving the "sleeping" field would need
> > to be implemented carefully to avoid races that may result in missed
> > wakeups so for simplicity I just decided to send both kinds of
> > wakeups. In particular the approach in the updated patch you sent is
> > racy and I'll elaborate more in the reply to that patch.
> >
> > > >   }
> > > >
> > > >   static int hvf_inject_interrupts(CPUState *cpu)
> > > > @@ -385,18 +377,19 @@ int hvf_vcpu_exec(CPUState *cpu)
> > > >           uint64_t syndrome = hvf_exit->exception.syndrome;
> > > >           uint32_t ec = syn_get_ec(syndrome);
> > > >
> > > > +        qemu_mutex_lock_iothread();
> > >
> > >
> > > Is there a particular reason you're moving the iothread lock out again
> > > from the individual bits? I would really like to keep a notion of fast
> > > path exits.
> >
> > We still need to lock at least once no matter the exit reason to check
> > the interrupts so I don't think it's worth it to try and avoid locking
> > like this. It also makes the implementation easier to reason about and
> > therefore more likely to be correct. In our implementation we just
> > stay locked the whole time unless we're in hv_vcpu_run() or pselect().
> >
>
> But does it leave a small window for a kick to be lost between
> qemu_mutex_unlock_iothread() and hv_vcpu_run()/pselect()?
>
> For x86 it could lose a kick between them. That was a reason for the
> sophisticated approach to catch the kick [1] (and related discussions in
> v1/v2/v3).  Unfortunately I can't read ARM assembly yet so I don't know if
> hv_vcpus_exit() suffers from the same issue as x86 hv_vcpu_interrupt().
>
> 1. https://patchwork.kernel.org/project/qemu-devel/patch/20200729124832.79375-1-r.bolshakov@yadro.com/

I addressed pselect() in my other reply.

It isn't on the website, but the hv_vcpu.h header says this about
hv_vcpus_exit():

 * @discussion
 *             If a vcpu is not running, the next time hv_vcpu_run is called
 *             for the corresponding vcpu, it will return immediately without
 *             entering the guest.

So at least as documented I think we are okay.
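
And for the pselect() leg (covered in more detail in the other reply), a
rough timeline of why a kick in that window isn't lost (assuming
cpus_kick_thread() boils down to a pthread_kill() with SIG_IPI, which I
believe it does here):

  vcpu thread                                    kicker thread
  -----------                                    -------------
  qatomic_mb_set(&cpu->thread_kicked, false);
  qemu_mutex_unlock_iothread();
                                                 cpus_kick_thread(cpu);
                                                   SIG_IPI is sent but blocked
                                                   on the vcpu thread, so it
                                                   stays pending
  pselect(0, 0, 0, 0, &ts, &unblock_ipi_mask);
    atomically unblocks SIG_IPI, the pending
    signal is delivered and pselect() returns
    immediately with EINTR instead of sleeping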

Peter

>
> Thanks,
> Roman
>
> > > >           switch (exit_reason) {
> > > >           case HV_EXIT_REASON_EXCEPTION:
> > > >               /* This is the main one, handle below. */
> > > >               break;
> > > >           case HV_EXIT_REASON_VTIMER_ACTIVATED:
> > > > -            qemu_mutex_lock_iothread();
> > > >               current_cpu = cpu;
> > > >               qemu_set_irq(arm_cpu->gt_timer_outputs[GTIMER_VIRT], 1);
> > > >               qemu_mutex_unlock_iothread();
> > > >               continue;
> > > >           case HV_EXIT_REASON_CANCELED:
> > > >               /* we got kicked, no exit to process */
> > > > +            qemu_mutex_unlock_iothread();
> > > >               continue;
> > > >           default:
> > > >               assert(0);
> > > > @@ -413,7 +406,6 @@ int hvf_vcpu_exec(CPUState *cpu)
> > > >               uint32_t srt = (syndrome >> 16) & 0x1f;
> > > >               uint64_t val = 0;
> > > >
> > > > -            qemu_mutex_lock_iothread();
> > > >               current_cpu = cpu;
> > > >
> > > >               DPRINTF("data abort: [pc=0x%llx va=0x%016llx pa=0x%016llx isv=%x "
> > > > @@ -446,8 +438,6 @@ int hvf_vcpu_exec(CPUState *cpu)
> > > >                   hvf_set_reg(cpu, srt, val);
> > > >               }
> > > >
> > > > -            qemu_mutex_unlock_iothread();
> > > > -
> > > >               advance_pc = true;
> > > >               break;
> > > >           }
> > > > @@ -493,68 +483,36 @@ int hvf_vcpu_exec(CPUState *cpu)
> > > >           case EC_WFX_TRAP:
> > > >               if (!(syndrome & WFX_IS_WFE) && !(cpu->interrupt_request &
> > > >                   (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) {
> > > > -                uint64_t cval, ctl, val, diff, now;
> > > > +                uint64_t cval;
> > > >
> > > > -                /* Set up a local timer for vtimer if necessary ... */
> > > > -                r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CTL_EL0, &ctl);
> > > > -                assert_hvf_ok(r);
> > > >                   r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CVAL_EL0, &cval);
> > > >                   assert_hvf_ok(r);
> > > >
> > > > -                asm volatile("mrs %0, cntvct_el0" : "=r"(val));
> > > > -                diff = cval - val;
> > > > -
> > > > -                now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) /
> > > > -                      gt_cntfrq_period_ns(arm_cpu);
> > > > -
> > > > -                /* Timer disabled or masked, just wait for long */
> > > > -                if (!(ctl & 1) || (ctl & 2)) {
> > > > -                    diff = (120 * NANOSECONDS_PER_SECOND) /
> > > > -                           gt_cntfrq_period_ns(arm_cpu);
> > > > +                int64_t ticks_to_sleep = cval - mach_absolute_time();
> > > > +                if (ticks_to_sleep < 0) {
> > > > +                    break;
> > >
> > >
> > > This will loop at 100% for Windows, which configures the vtimer as
> > > cval=0 ctl=7, so with IRQ mask bit set.
> >
> > Okay, but the 120s is kind of arbitrary so we should just sleep until
> > we get a signal. That can be done by passing null as the timespec
> > argument to pselect().
> >
> > >
> > >
> > > Alex
> > >
> > >
> > > >                   }
> > > >
> > > > -                if (diff < INT64_MAX) {
> > > > -                    uint64_t ns = diff * gt_cntfrq_period_ns(arm_cpu);
> > > > -                    struct timespec *ts = &cpu->hvf->ts;
> > > > -
> > > > -                    *ts = (struct timespec){
> > > > -                        .tv_sec = ns / NANOSECONDS_PER_SECOND,
> > > > -                        .tv_nsec = ns % NANOSECONDS_PER_SECOND,
> > > > -                    };
> > > > -
> > > > -                    /*
> > > > -                     * Waking up easily takes 1ms, don't go to sleep for smaller
> > > > -                     * time periods than 2ms.
> > > > -                     */
> > > > -                    if (!ts->tv_sec && (ts->tv_nsec < (SCALE_MS * 2))) {
> > >
> > >
> > > I put this logic here on purpose. A pselect(1 ns) easily takes 1-2ms to
> > > return. Without logic like this, super short WFIs will hurt performance
> > > quite badly.
> >
> > I don't think that's accurate. According to this benchmark it's a few
> > hundred nanoseconds at most.
> >
> > pcc@pac-mini /tmp> cat pselect.c
> > #include <signal.h>
> > #include <sys/select.h>
> >
> > int main() {
> >   sigset_t mask, orig_mask;
> >   pthread_sigmask(SIG_SETMASK, 0, &mask);
> >   sigaddset(&mask, SIGUSR1);
> >   pthread_sigmask(SIG_SETMASK, &mask, &orig_mask);
> >
> >   for (int i = 0; i != 1000000; ++i) {
> >     struct timespec ts = { 0, 1 };
> >     pselect(0, 0, 0, 0, &ts, &orig_mask);
> >   }
> > }
> > pcc@pac-mini /tmp> time ./pselect
> >
> > ________________________________________________________
> > Executed in  179.87 millis    fish           external
> >    usr time   77.68 millis   57.00 micros   77.62 millis
> >    sys time  101.37 millis  852.00 micros  100.52 millis
> >
> > Besides, all that you're really saving here is the single pselect
> > call. There are no doubt more expensive syscalls involved in exiting
> > and entering the VCPU that would dominate here.
> >
> > Peter
> >
> > >
> > >
> > > Alex
> > >
> > > > -                        advance_pc = true;
> > > > -                        break;
> > > > -                    }
> > > > -
> > > > -                    /* Set cpu->hvf->sleeping so that we get a SIG_IPI signal. */
> > > > -                    cpu->hvf->sleeping = true;
> > > > -                    smp_mb();
> > > > -
> > > > -                    /* Bail out if we received an IRQ meanwhile */
> > > > -                    if (cpu->thread_kicked || (cpu->interrupt_request &
> > > > -                        (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) {
> > > > -                        cpu->hvf->sleeping = false;
> > > > -                        break;
> > > > -                    }
> > > > -
> > > > -                    /* nanosleep returns on signal, so we wake up on kick. */
> > > > -                    nanosleep(ts, NULL);
> > > > -
> > > > -                    /* Out of sleep - either naturally or because of a kick */
> > > > -                    cpu->hvf->sleeping = false;
> > > > -                }
> > > > +                uint64_t seconds = ticks_to_sleep / arm_cpu->gt_cntfrq_hz;
> > > > +                uint64_t nanos =
> > > > +                    (ticks_to_sleep - arm_cpu->gt_cntfrq_hz * seconds) *
> > > > +                    1000000000 / arm_cpu->gt_cntfrq_hz;
> > > > +                struct timespec ts = { seconds, nanos };
> > > > +
> > > > +                /*
> > > > +                 * Use pselect to sleep so that other threads can IPI us while
> > > > +                 * we're sleeping.
> > > > +                 */
> > > > +                qatomic_mb_set(&cpu->thread_kicked, false);
> > > > +                qemu_mutex_unlock_iothread();
> > > > +                pselect(0, 0, 0, 0, &ts, &cpu->hvf->unblock_ipi_mask);
> > > > +                qemu_mutex_lock_iothread();
> > > >
> > > >                   advance_pc = true;
> > > >               }
> > > >               break;
> > > >           case EC_AA64_HVC:
> > > >               cpu_synchronize_state(cpu);
> > > > -            qemu_mutex_lock_iothread();
> > > >               current_cpu = cpu;
> > > >               if (arm_is_psci_call(arm_cpu, EXCP_HVC)) {
> > > >                   arm_handle_psci_call(arm_cpu);
> > > > @@ -562,11 +520,9 @@ int hvf_vcpu_exec(CPUState *cpu)
> > > >                   DPRINTF("unknown HVC! %016llx", env->xregs[0]);
> > > >                   env->xregs[0] = -1;
> > > >               }
> > > > -            qemu_mutex_unlock_iothread();
> > > >               break;
> > > >           case EC_AA64_SMC:
> > > >               cpu_synchronize_state(cpu);
> > > > -            qemu_mutex_lock_iothread();
> > > >               current_cpu = cpu;
> > > >               if (arm_is_psci_call(arm_cpu, EXCP_SMC)) {
> > > >                   arm_handle_psci_call(arm_cpu);
> > > > @@ -575,7 +531,6 @@ int hvf_vcpu_exec(CPUState *cpu)
> > > >                   env->xregs[0] = -1;
> > > >                   env->pc += 4;
> > > >               }
> > > > -            qemu_mutex_unlock_iothread();
> > > >               break;
> > > >           default:
> > > >               cpu_synchronize_state(cpu);
> > > > @@ -594,6 +549,7 @@ int hvf_vcpu_exec(CPUState *cpu)
> > > >               r = hv_vcpu_set_reg(cpu->hvf->fd, HV_REG_PC, pc);
> > > >               assert_hvf_ok(r);
> > > >           }
> > > > +        qemu_mutex_unlock_iothread();
> > > >       } while (ret == 0);
> > > >
> > > >       qemu_mutex_lock_iothread();

Re: [PATCH] arm/hvf: Optimize and simplify WFI handling
Posted by Alexander Graf 3 years, 5 months ago
On 01.12.20 09:21, Peter Collingbourne wrote:
> Sleep on WFx until the VTIMER is due but allow ourselves to be woken
> up on IPI.
>
> Signed-off-by: Peter Collingbourne <pcc@google.com>
> ---
> Alexander Graf wrote:
>> I would love to take a patch from you here :). I'll still be stuck for a
>> while with the sysreg sync rework that Peter asked for before I can look
>> at WFI again.
> Okay, here's a patch :) It's a relatively straightforward adaptation
> of what we have in our fork, which can now boot Android to GUI while
> remaining at around 4% CPU when idle.
>
> I'm not set up to boot a full Linux distribution at the moment so I
> tested it on upstream QEMU by running a recent mainline Linux kernel
> with a rootfs containing an init program that just does sleep(5)
> and verified that the qemu process remains at low CPU usage during
> the sleep. This was on top of your v2 plus the last patch of your v1
> since it doesn't look like you have a replacement for that logic yet.


How about something like this instead?


Alex


diff --git a/accel/hvf/hvf-cpus.c b/accel/hvf/hvf-cpus.c
index 4360f64671..50384013ea 100644
--- a/accel/hvf/hvf-cpus.c
+++ b/accel/hvf/hvf-cpus.c
@@ -337,16 +337,18 @@ static int hvf_init_vcpu(CPUState *cpu)
      cpu->hvf = g_malloc0(sizeof(*cpu->hvf));

      /* init cpu signals */
-    sigset_t set;
      struct sigaction sigact;

      memset(&sigact, 0, sizeof(sigact));
      sigact.sa_handler = dummy_signal;
      sigaction(SIG_IPI, &sigact, NULL);

-    pthread_sigmask(SIG_BLOCK, NULL, &set);
-    sigdelset(&set, SIG_IPI);
-    pthread_sigmask(SIG_SETMASK, &set, NULL);
+    pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->sigmask);
+    sigdelset(&cpu->hvf->sigmask, SIG_IPI);
+    pthread_sigmask(SIG_SETMASK, &cpu->hvf->sigmask, NULL);
+
+    pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->sigmask_ipi);
+    sigaddset(&cpu->hvf->sigmask_ipi, SIG_IPI);

  #ifdef __aarch64__
      r = hv_vcpu_create(&cpu->hvf->fd, (hv_vcpu_exit_t **)&cpu->hvf->exit, NULL);
diff --git a/include/sysemu/hvf_int.h b/include/sysemu/hvf_int.h
index c56baa3ae8..6e237f2db0 100644
--- a/include/sysemu/hvf_int.h
+++ b/include/sysemu/hvf_int.h
@@ -62,8 +62,9 @@ extern HVFState *hvf_state;
  struct hvf_vcpu_state {
      uint64_t fd;
      void *exit;
-    struct timespec ts;
      bool sleeping;
+    sigset_t sigmask;
+    sigset_t sigmask_ipi;
  };

  void assert_hvf_ok(hv_return_t ret);
diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c
index 0c01a03725..350b845e6e 100644
--- a/target/arm/hvf/hvf.c
+++ b/target/arm/hvf/hvf.c
@@ -320,20 +320,24 @@ int hvf_arch_init_vcpu(CPUState *cpu)

  void hvf_kick_vcpu_thread(CPUState *cpu)
  {
-    if (cpu->hvf->sleeping) {
-        /*
-         * When sleeping, make sure we always send signals. Also, clear the
-         * timespec, so that an IPI that arrives between setting hvf->sleeping
-         * and the nanosleep syscall still aborts the sleep.
-         */
-        cpu->thread_kicked = false;
-        cpu->hvf->ts = (struct timespec){ };
+    if (qatomic_read(&cpu->hvf->sleeping)) {
+        /* When sleeping, send a signal to get out of pselect */
          cpus_kick_thread(cpu);
      } else {
          hv_vcpus_exit(&cpu->hvf->fd, 1);
      }
  }

+static void hvf_block_sig_ipi(CPUState *cpu)
+{
+    pthread_sigmask(SIG_SETMASK, &cpu->hvf->sigmask_ipi, NULL);
+}
+
+static void hvf_unblock_sig_ipi(CPUState *cpu)
+{
+    pthread_sigmask(SIG_SETMASK, &cpu->hvf->sigmask, NULL);
+}
+
  static int hvf_inject_interrupts(CPUState *cpu)
  {
      if (cpu->interrupt_request & CPU_INTERRUPT_FIQ) {
@@ -354,6 +358,7 @@ int hvf_vcpu_exec(CPUState *cpu)
      ARMCPU *arm_cpu = ARM_CPU(cpu);
      CPUARMState *env = &arm_cpu->env;
      hv_vcpu_exit_t *hvf_exit = cpu->hvf->exit;
+    const uint32_t irq_mask = CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ;
      hv_return_t r;
      int ret = 0;

@@ -491,8 +496,8 @@ int hvf_vcpu_exec(CPUState *cpu)
              break;
          }
          case EC_WFX_TRAP:
-            if (!(syndrome & WFX_IS_WFE) && !(cpu->interrupt_request &
-                (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) {
+            if (!(syndrome & WFX_IS_WFE) &&
+                !(cpu->interrupt_request & irq_mask)) {
                  uint64_t cval, ctl, val, diff, now;

                  /* Set up a local timer for vtimer if necessary ... */
@@ -515,9 +520,7 @@ int hvf_vcpu_exec(CPUState *cpu)

                  if (diff < INT64_MAX) {
                      uint64_t ns = diff * gt_cntfrq_period_ns(arm_cpu);
-                    struct timespec *ts = &cpu->hvf->ts;
-
-                    *ts = (struct timespec){
+                    struct timespec ts = {
                          .tv_sec = ns / NANOSECONDS_PER_SECOND,
                          .tv_nsec = ns % NANOSECONDS_PER_SECOND,
                      };
@@ -526,27 +529,31 @@ int hvf_vcpu_exec(CPUState *cpu)
                       * Waking up easily takes 1ms, don't go to sleep for smaller
                       * time periods than 2ms.
                       */
-                    if (!ts->tv_sec && (ts->tv_nsec < (SCALE_MS * 2))) {
+                    if (!ts.tv_sec && (ts.tv_nsec < (SCALE_MS * 2))) {
                          advance_pc = true;
                          break;
                      }

+                    /* block SIG_IPI for the sleep */
+                    hvf_block_sig_ipi(cpu);
+                    cpu->thread_kicked = false;
+
                      /* Set cpu->hvf->sleeping so that we get a SIG_IPI signal. */
-                    cpu->hvf->sleeping = true;
-                    smp_mb();
+                    qatomic_set(&cpu->hvf->sleeping, true);

-                    /* Bail out if we received an IRQ meanwhile */
-                    if (cpu->thread_kicked || (cpu->interrupt_request &
-                        (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) {
-                        cpu->hvf->sleeping = false;
+                    /* Bail out if we received a kick meanwhile */
+                    if (qatomic_read(&cpu->interrupt_request) & irq_mask) {
+                        qatomic_set(&cpu->hvf->sleeping, false);
+                        hvf_unblock_sig_ipi(cpu);
                          break;
                      }

-                    /* nanosleep returns on signal, so we wake up on kick. */
-                    nanosleep(ts, NULL);
+                    /* pselect returns on kick signal and consumes it */
+                    pselect(0, 0, 0, 0, &ts, &cpu->hvf->sigmask);

                      /* Out of sleep - either naturally or because of a kick */
-                    cpu->hvf->sleeping = false;
+                    qatomic_set(&cpu->hvf->sleeping, false);
+                    hvf_unblock_sig_ipi(cpu);
                  }

                  advance_pc = true;


Re: [PATCH] arm/hvf: Optimize and simplify WFI handling
Posted by Peter Collingbourne 3 years, 5 months ago
On Tue, Dec 1, 2020 at 8:26 AM Alexander Graf <agraf@csgraf.de> wrote:
>
>
> On 01.12.20 09:21, Peter Collingbourne wrote:
> > Sleep on WFx until the VTIMER is due but allow ourselves to be woken
> > up on IPI.
> >
> > Signed-off-by: Peter Collingbourne <pcc@google.com>
> > ---
> > Alexander Graf wrote:
> >> I would love to take a patch from you here :). I'll still be stuck for a
> >> while with the sysreg sync rework that Peter asked for before I can look
> >> at WFI again.
> > Okay, here's a patch :) It's a relatively straightforward adaptation
> > of what we have in our fork, which can now boot Android to GUI while
> > remaining at around 4% CPU when idle.
> >
> > I'm not set up to boot a full Linux distribution at the moment so I
> > tested it on upstream QEMU by running a recent mainline Linux kernel
> > with a rootfs containing an init program that just does sleep(5)
> > and verified that the qemu process remains at low CPU usage during
> > the sleep. This was on top of your v2 plus the last patch of your v1
> > since it doesn't look like you have a replacement for that logic yet.
>
>
> How about something like this instead?
>
>
> Alex
>
>
> diff --git a/accel/hvf/hvf-cpus.c b/accel/hvf/hvf-cpus.c
> index 4360f64671..50384013ea 100644
> --- a/accel/hvf/hvf-cpus.c
> +++ b/accel/hvf/hvf-cpus.c
> @@ -337,16 +337,18 @@ static int hvf_init_vcpu(CPUState *cpu)
>       cpu->hvf = g_malloc0(sizeof(*cpu->hvf));
>
>       /* init cpu signals */
> -    sigset_t set;
>       struct sigaction sigact;
>
>       memset(&sigact, 0, sizeof(sigact));
>       sigact.sa_handler = dummy_signal;
>       sigaction(SIG_IPI, &sigact, NULL);
>
> -    pthread_sigmask(SIG_BLOCK, NULL, &set);
> -    sigdelset(&set, SIG_IPI);
> -    pthread_sigmask(SIG_SETMASK, &set, NULL);
> +    pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->sigmask);
> +    sigdelset(&cpu->hvf->sigmask, SIG_IPI);
> +    pthread_sigmask(SIG_SETMASK, &cpu->hvf->sigmask, NULL);
> +
> +    pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->sigmask_ipi);
> +    sigaddset(&cpu->hvf->sigmask_ipi, SIG_IPI);

There's no reason to unblock SIG_IPI while not in pselect and it can
easily lead to missed wakeups. The whole point of pselect is so that
you can guarantee that only one part of your program sees signals
without a possibility of them being missed.
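
As an illustration of that guarantee, here is a small standalone sketch
(hypothetical program, SIGUSR1 standing in for SIG_IPI): the kick is sent
while the signal is blocked, so it stays pending and the following pselect()
returns immediately instead of sleeping out its timeout -- the wakeup cannot
be missed.

#include <pthread.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/select.h>
#include <time.h>
#include <unistd.h>

static void dummy_signal(int sig) { (void)sig; }

int main(void)
{
    sigset_t block_set, unblock_mask;
    struct sigaction sigact;
    struct timespec ts = { .tv_sec = 10 };
    int r;

    memset(&sigact, 0, sizeof(sigact));
    sigact.sa_handler = dummy_signal;
    sigaction(SIGUSR1, &sigact, NULL);

    /* Keep the kick signal blocked during normal execution ... */
    sigemptyset(&block_set);
    sigaddset(&block_set, SIGUSR1);
    pthread_sigmask(SIG_BLOCK, &block_set, NULL);

    /* ... and compute a mask with it unblocked, used only by pselect(). */
    pthread_sigmask(SIG_BLOCK, NULL, &unblock_mask);
    sigdelset(&unblock_mask, SIGUSR1);

    /* A kick that arrives while blocked stays pending instead of being lost ... */
    kill(getpid(), SIGUSR1);

    /* ... so this 10 second "sleep" returns almost immediately with EINTR. */
    r = pselect(0, NULL, NULL, NULL, &ts, &unblock_mask);
    printf("pselect() returned %d\n", r);
    return 0;
}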

>
>   #ifdef __aarch64__
>       r = hv_vcpu_create(&cpu->hvf->fd, (hv_vcpu_exit_t
> **)&cpu->hvf->exit, NULL);
> diff --git a/include/sysemu/hvf_int.h b/include/sysemu/hvf_int.h
> index c56baa3ae8..6e237f2db0 100644
> --- a/include/sysemu/hvf_int.h
> +++ b/include/sysemu/hvf_int.h
> @@ -62,8 +62,9 @@ extern HVFState *hvf_state;
>   struct hvf_vcpu_state {
>       uint64_t fd;
>       void *exit;
> -    struct timespec ts;
>       bool sleeping;
> +    sigset_t sigmask;
> +    sigset_t sigmask_ipi;
>   };
>
>   void assert_hvf_ok(hv_return_t ret);
> diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c
> index 0c01a03725..350b845e6e 100644
> --- a/target/arm/hvf/hvf.c
> +++ b/target/arm/hvf/hvf.c
> @@ -320,20 +320,24 @@ int hvf_arch_init_vcpu(CPUState *cpu)
>
>   void hvf_kick_vcpu_thread(CPUState *cpu)
>   {
> -    if (cpu->hvf->sleeping) {
> -        /*
> -         * When sleeping, make sure we always send signals. Also, clear the
> -         * timespec, so that an IPI that arrives between setting
> hvf->sleeping
> -         * and the nanosleep syscall still aborts the sleep.
> -         */
> -        cpu->thread_kicked = false;
> -        cpu->hvf->ts = (struct timespec){ };
> +    if (qatomic_read(&cpu->hvf->sleeping)) {
> +        /* When sleeping, send a signal to get out of pselect */
>           cpus_kick_thread(cpu);
>       } else {
>           hv_vcpus_exit(&cpu->hvf->fd, 1);
>       }
>   }
>
> +static void hvf_block_sig_ipi(CPUState *cpu)
> +{
> +    pthread_sigmask(SIG_SETMASK, &cpu->hvf->sigmask_ipi, NULL);
> +}
> +
> +static void hvf_unblock_sig_ipi(CPUState *cpu)
> +{
> +    pthread_sigmask(SIG_SETMASK, &cpu->hvf->sigmask, NULL);
> +}
> +
>   static int hvf_inject_interrupts(CPUState *cpu)
>   {
>       if (cpu->interrupt_request & CPU_INTERRUPT_FIQ) {
> @@ -354,6 +358,7 @@ int hvf_vcpu_exec(CPUState *cpu)
>       ARMCPU *arm_cpu = ARM_CPU(cpu);
>       CPUARMState *env = &arm_cpu->env;
>       hv_vcpu_exit_t *hvf_exit = cpu->hvf->exit;
> +    const uint32_t irq_mask = CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ;
>       hv_return_t r;
>       int ret = 0;
>
> @@ -491,8 +496,8 @@ int hvf_vcpu_exec(CPUState *cpu)
>               break;
>           }
>           case EC_WFX_TRAP:
> -            if (!(syndrome & WFX_IS_WFE) && !(cpu->interrupt_request &
> -                (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) {
> +            if (!(syndrome & WFX_IS_WFE) &&
> +                !(cpu->interrupt_request & irq_mask)) {
>                   uint64_t cval, ctl, val, diff, now;

I don't think the access to cpu->interrupt_request is safe because it
is done while not under the iothread lock. That's why to avoid these
types of issues I would prefer to hold the lock almost all of the
time.

>                   /* Set up a local timer for vtimer if necessary ... */
> @@ -515,9 +520,7 @@ int hvf_vcpu_exec(CPUState *cpu)
>
>                   if (diff < INT64_MAX) {
>                       uint64_t ns = diff * gt_cntfrq_period_ns(arm_cpu);
> -                    struct timespec *ts = &cpu->hvf->ts;
> -
> -                    *ts = (struct timespec){
> +                    struct timespec ts = {
>                           .tv_sec = ns / NANOSECONDS_PER_SECOND,
>                           .tv_nsec = ns % NANOSECONDS_PER_SECOND,
>                       };
> @@ -526,27 +529,31 @@ int hvf_vcpu_exec(CPUState *cpu)
>                        * Waking up easily takes 1ms, don't go to sleep
> for smaller
>                        * time periods than 2ms.
>                        */
> -                    if (!ts->tv_sec && (ts->tv_nsec < (SCALE_MS * 2))) {
> +                    if (!ts.tv_sec && (ts.tv_nsec < (SCALE_MS * 2))) {
>                           advance_pc = true;
>                           break;
>                       }
>
> +                    /* block SIG_IPI for the sleep */
> +                    hvf_block_sig_ipi(cpu);
> +                    cpu->thread_kicked = false;
> +
>                       /* Set cpu->hvf->sleeping so that we get a SIG_IPI
> signal. */
> -                    cpu->hvf->sleeping = true;
> -                    smp_mb();
> +                    qatomic_set(&cpu->hvf->sleeping, true);

This doesn't protect against races because another thread could call
hvf_kick_vcpu_thread() at any time between when we return from
hv_vcpu_run() and when we set sleeping = true, and we would miss the
wakeup (due to hvf_kick_vcpu_thread() seeing sleeping = false and
calling hv_vcpus_exit() instead of pthread_kill()). I don't think it
can be fixed by setting sleeping to true earlier either, because no
matter how early you move it, there will always be a window where we
are about to call pselect() but sleeping is still false, resulting in
a missed wakeup.
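
To make the window concrete, here is a contrived standalone demo (hypothetical
names; the 100ms delay only widens the race on purpose): the kicker sees
sleeping == true and sends its signal, but the signal is handled before
nanosleep() begins, so nothing interrupts the sleep and the full two seconds
elapse -- effectively a lost wakeup.

#include <pthread.h>
#include <signal.h>
#include <stdatomic.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

static atomic_bool sleeping;
static pthread_t sleeper_thread;

static void dummy_signal(int sig) { (void)sig; }

static void *kicker(void *arg)
{
    (void)arg;
    while (!atomic_load(&sleeping)) {
        /* wait until the sleeper claims to be asleep */
    }
    pthread_kill(sleeper_thread, SIGUSR1);  /* lands before nanosleep() starts */
    return NULL;
}

int main(void)
{
    struct sigaction sigact;
    pthread_t t;
    struct timespec ts = { 2, 0 }, rem = { 0, 0 };

    memset(&sigact, 0, sizeof(sigact));
    sigact.sa_handler = dummy_signal;
    sigaction(SIGUSR1, &sigact, NULL);

    sleeper_thread = pthread_self();
    pthread_create(&t, NULL, kicker, NULL);

    atomic_store(&sleeping, true);
    usleep(100 * 1000);                     /* deliberately widen the window */

    nanosleep(&ts, &rem);                   /* kick already consumed: full sleep */
    printf("time left when nanosleep() returned: %ld.%09ld s\n",
           (long)rem.tv_sec, rem.tv_nsec);

    atomic_store(&sleeping, false);
    pthread_join(t, NULL);
    return 0;
}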

Peter

>
> -                    /* Bail out if we received an IRQ meanwhile */
> -                    if (cpu->thread_kicked || (cpu->interrupt_request &
> -                        (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) {
> -                        cpu->hvf->sleeping = false;
> +                    /* Bail out if we received a kick meanwhile */
> +                    if (qatomic_read(&cpu->interrupt_request) & irq_mask) {
> + qatomic_set(&cpu->hvf->sleeping, false);
> +                        hvf_unblock_sig_ipi(cpu);
>                           break;
>                       }
>
> -                    /* nanosleep returns on signal, so we wake up on
> kick. */
> -                    nanosleep(ts, NULL);
> +                    /* pselect returns on kick signal and consumes it */
> +                    pselect(0, 0, 0, 0, &ts, &cpu->hvf->sigmask);
>
>                       /* Out of sleep - either naturally or because of a
> kick */
> -                    cpu->hvf->sleeping = false;
> +                    qatomic_set(&cpu->hvf->sleeping, false);
> +                    hvf_unblock_sig_ipi(cpu);
>                   }
>
>                   advance_pc = true;
>

Re: [PATCH] arm/hvf: Optimize and simplify WFI handling
Posted by Alexander Graf 3 years, 5 months ago
On 01.12.20 21:03, Peter Collingbourne wrote:
> On Tue, Dec 1, 2020 at 8:26 AM Alexander Graf <agraf@csgraf.de> wrote:
>>
>> On 01.12.20 09:21, Peter Collingbourne wrote:
>>> Sleep on WFx until the VTIMER is due but allow ourselves to be woken
>>> up on IPI.
>>>
>>> Signed-off-by: Peter Collingbourne <pcc@google.com>
>>> ---
>>> Alexander Graf wrote:
>>>> I would love to take a patch from you here :). I'll still be stuck for a
>>>> while with the sysreg sync rework that Peter asked for before I can look
>>>> at WFI again.
>>> Okay, here's a patch :) It's a relatively straightforward adaptation
>>> of what we have in our fork, which can now boot Android to GUI while
>>> remaining at around 4% CPU when idle.
>>>
>>> I'm not set up to boot a full Linux distribution at the moment so I
>>> tested it on upstream QEMU by running a recent mainline Linux kernel
>>> with a rootfs containing an init program that just does sleep(5)
>>> and verified that the qemu process remains at low CPU usage during
>>> the sleep. This was on top of your v2 plus the last patch of your v1
>>> since it doesn't look like you have a replacement for that logic yet.
>>
>> How about something like this instead?
>>
>>
>> Alex
>>
>>
>> diff --git a/accel/hvf/hvf-cpus.c b/accel/hvf/hvf-cpus.c
>> index 4360f64671..50384013ea 100644
>> --- a/accel/hvf/hvf-cpus.c
>> +++ b/accel/hvf/hvf-cpus.c
>> @@ -337,16 +337,18 @@ static int hvf_init_vcpu(CPUState *cpu)
>>        cpu->hvf = g_malloc0(sizeof(*cpu->hvf));
>>
>>        /* init cpu signals */
>> -    sigset_t set;
>>        struct sigaction sigact;
>>
>>        memset(&sigact, 0, sizeof(sigact));
>>        sigact.sa_handler = dummy_signal;
>>        sigaction(SIG_IPI, &sigact, NULL);
>>
>> -    pthread_sigmask(SIG_BLOCK, NULL, &set);
>> -    sigdelset(&set, SIG_IPI);
>> -    pthread_sigmask(SIG_SETMASK, &set, NULL);
>> +    pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->sigmask);
>> +    sigdelset(&cpu->hvf->sigmask, SIG_IPI);
>> +    pthread_sigmask(SIG_SETMASK, &cpu->hvf->sigmask, NULL);
>> +
>> +    pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->sigmask_ipi);
>> +    sigaddset(&cpu->hvf->sigmask_ipi, SIG_IPI);
> There's no reason to unblock SIG_IPI while not in pselect and it can
> easily lead to missed wakeups. The whole point of pselect is so that
> you can guarantee that only one part of your program sees signals
> without a possibility of them being missed.


Hm, I think I start to agree with you here :). We can probably just 
leave SIG_IPI masked at all times and only unmask on pselect. The worst 
thing that will happen is a premature wakeup if we did get an IPI 
incoming while hvf->sleeping is set, but were either not running 
pselect() yet and bailed out or already finished pselect() execution.


>
>>    #ifdef __aarch64__
>>        r = hv_vcpu_create(&cpu->hvf->fd, (hv_vcpu_exit_t
>> **)&cpu->hvf->exit, NULL);
>> diff --git a/include/sysemu/hvf_int.h b/include/sysemu/hvf_int.h
>> index c56baa3ae8..6e237f2db0 100644
>> --- a/include/sysemu/hvf_int.h
>> +++ b/include/sysemu/hvf_int.h
>> @@ -62,8 +62,9 @@ extern HVFState *hvf_state;
>>    struct hvf_vcpu_state {
>>        uint64_t fd;
>>        void *exit;
>> -    struct timespec ts;
>>        bool sleeping;
>> +    sigset_t sigmask;
>> +    sigset_t sigmask_ipi;
>>    };
>>
>>    void assert_hvf_ok(hv_return_t ret);
>> diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c
>> index 0c01a03725..350b845e6e 100644
>> --- a/target/arm/hvf/hvf.c
>> +++ b/target/arm/hvf/hvf.c
>> @@ -320,20 +320,24 @@ int hvf_arch_init_vcpu(CPUState *cpu)
>>
>>    void hvf_kick_vcpu_thread(CPUState *cpu)
>>    {
>> -    if (cpu->hvf->sleeping) {
>> -        /*
>> -         * When sleeping, make sure we always send signals. Also, clear the
>> -         * timespec, so that an IPI that arrives between setting
>> hvf->sleeping
>> -         * and the nanosleep syscall still aborts the sleep.
>> -         */
>> -        cpu->thread_kicked = false;
>> -        cpu->hvf->ts = (struct timespec){ };
>> +    if (qatomic_read(&cpu->hvf->sleeping)) {
>> +        /* When sleeping, send a signal to get out of pselect */
>>            cpus_kick_thread(cpu);
>>        } else {
>>            hv_vcpus_exit(&cpu->hvf->fd, 1);
>>        }
>>    }
>>
>> +static void hvf_block_sig_ipi(CPUState *cpu)
>> +{
>> +    pthread_sigmask(SIG_SETMASK, &cpu->hvf->sigmask_ipi, NULL);
>> +}
>> +
>> +static void hvf_unblock_sig_ipi(CPUState *cpu)
>> +{
>> +    pthread_sigmask(SIG_SETMASK, &cpu->hvf->sigmask, NULL);
>> +}
>> +
>>    static int hvf_inject_interrupts(CPUState *cpu)
>>    {
>>        if (cpu->interrupt_request & CPU_INTERRUPT_FIQ) {
>> @@ -354,6 +358,7 @@ int hvf_vcpu_exec(CPUState *cpu)
>>        ARMCPU *arm_cpu = ARM_CPU(cpu);
>>        CPUARMState *env = &arm_cpu->env;
>>        hv_vcpu_exit_t *hvf_exit = cpu->hvf->exit;
>> +    const uint32_t irq_mask = CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ;
>>        hv_return_t r;
>>        int ret = 0;
>>
>> @@ -491,8 +496,8 @@ int hvf_vcpu_exec(CPUState *cpu)
>>                break;
>>            }
>>            case EC_WFX_TRAP:
>> -            if (!(syndrome & WFX_IS_WFE) && !(cpu->interrupt_request &
>> -                (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) {
>> +            if (!(syndrome & WFX_IS_WFE) &&
>> +                !(cpu->interrupt_request & irq_mask)) {
>>                    uint64_t cval, ctl, val, diff, now;
> I don't think the access to cpu->interrupt_request is safe because it
> is done while not under the iothread lock. That's why to avoid these
> types of issues I would prefer to hold the lock almost all of the
> time.


In this branch, that's not a problem yet. On stale values, we either 
don't sleep (which is ok), or we go into the sleep path, and reevaluate 
cpu->interrupt_request atomically again after setting hvf->sleeping.


>
>>                    /* Set up a local timer for vtimer if necessary ... */
>> @@ -515,9 +520,7 @@ int hvf_vcpu_exec(CPUState *cpu)
>>
>>                    if (diff < INT64_MAX) {
>>                        uint64_t ns = diff * gt_cntfrq_period_ns(arm_cpu);
>> -                    struct timespec *ts = &cpu->hvf->ts;
>> -
>> -                    *ts = (struct timespec){
>> +                    struct timespec ts = {
>>                            .tv_sec = ns / NANOSECONDS_PER_SECOND,
>>                            .tv_nsec = ns % NANOSECONDS_PER_SECOND,
>>                        };
>> @@ -526,27 +529,31 @@ int hvf_vcpu_exec(CPUState *cpu)
>>                         * Waking up easily takes 1ms, don't go to sleep
>> for smaller
>>                         * time periods than 2ms.
>>                         */
>> -                    if (!ts->tv_sec && (ts->tv_nsec < (SCALE_MS * 2))) {
>> +                    if (!ts.tv_sec && (ts.tv_nsec < (SCALE_MS * 2))) {
>>                            advance_pc = true;
>>                            break;
>>                        }
>>
>> +                    /* block SIG_IPI for the sleep */
>> +                    hvf_block_sig_ipi(cpu);
>> +                    cpu->thread_kicked = false;
>> +
>>                        /* Set cpu->hvf->sleeping so that we get a SIG_IPI
>> signal. */
>> -                    cpu->hvf->sleeping = true;
>> -                    smp_mb();
>> +                    qatomic_set(&cpu->hvf->sleeping, true);
> This doesn't protect against races because another thread could call
> hvf_kick_vcpu_thread() at any time between when we return from
> hv_vcpu_run() and when we set sleeping = true and we would miss the
> wakeup (due to hvf_kick_vcpu_thread() seeing sleeping = false and
> calling hv_vcpus_exit() instead of pthread_kill()). I don't think it
> can be fixed by setting sleeping to true earlier either because no
> matter how early you move it, there will always be a window where we
> are going to pselect() but sleeping is false, resulting in a missed
> wakeup.


I don't follow. If anyone was sending us an IPI, it's because they want 
to notify us about an update to cpu->interrupt_request, right? In that 
case, the atomic read of that field below will catch it and bail out of 
the sleep sequence.


>
> Peter
>
>> -                    /* Bail out if we received an IRQ meanwhile */
>> -                    if (cpu->thread_kicked || (cpu->interrupt_request &
>> -                        (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) {
>> -                        cpu->hvf->sleeping = false;
>> +                    /* Bail out if we received a kick meanwhile */
>> +                    if (qatomic_read(&cpu->interrupt_request) & irq_mask) {
>> + qatomic_set(&cpu->hvf->sleeping, false);


^^^


Alex


>> +                        hvf_unblock_sig_ipi(cpu);
>>                            break;
>>                        }
>>
>> -                    /* nanosleep returns on signal, so we wake up on
>> kick. */
>> -                    nanosleep(ts, NULL);
>> +                    /* pselect returns on kick signal and consumes it */
>> +                    pselect(0, 0, 0, 0, &ts, &cpu->hvf->sigmask);
>>
>>                        /* Out of sleep - either naturally or because of a
>> kick */
>> -                    cpu->hvf->sleeping = false;
>> +                    qatomic_set(&cpu->hvf->sleeping, false);
>> +                    hvf_unblock_sig_ipi(cpu);
>>                    }
>>
>>                    advance_pc = true;
>>

Re: [PATCH] arm/hvf: Optimize and simplify WFI handling
Posted by Alexander Graf 3 years, 5 months ago
On 01.12.20 23:09, Alexander Graf wrote:
>
> On 01.12.20 21:03, Peter Collingbourne wrote:
>> On Tue, Dec 1, 2020 at 8:26 AM Alexander Graf <agraf@csgraf.de> wrote:
>>>
>>> On 01.12.20 09:21, Peter Collingbourne wrote:
>>>> Sleep on WFx until the VTIMER is due but allow ourselves to be woken
>>>> up on IPI.
>>>>
>>>> Signed-off-by: Peter Collingbourne <pcc@google.com>
>>>> ---
>>>> Alexander Graf wrote:
>>>>> I would love to take a patch from you here :). I'll still be stuck 
>>>>> for a
>>>>> while with the sysreg sync rework that Peter asked for before I 
>>>>> can look
>>>>> at WFI again.
>>>> Okay, here's a patch :) It's a relatively straightforward adaptation
>>>> of what we have in our fork, which can now boot Android to GUI while
>>>> remaining at around 4% CPU when idle.
>>>>
>>>> I'm not set up to boot a full Linux distribution at the moment so I
>>>> tested it on upstream QEMU by running a recent mainline Linux kernel
>>>> with a rootfs containing an init program that just does sleep(5)
>>>> and verified that the qemu process remains at low CPU usage during
>>>> the sleep. This was on top of your v2 plus the last patch of your v1
>>>> since it doesn't look like you have a replacement for that logic yet.
>>>
>>> How about something like this instead?
>>>
>>>
>>> Alex
>>>
>>>
>>> diff --git a/accel/hvf/hvf-cpus.c b/accel/hvf/hvf-cpus.c
>>> index 4360f64671..50384013ea 100644
>>> --- a/accel/hvf/hvf-cpus.c
>>> +++ b/accel/hvf/hvf-cpus.c
>>> @@ -337,16 +337,18 @@ static int hvf_init_vcpu(CPUState *cpu)
>>>        cpu->hvf = g_malloc0(sizeof(*cpu->hvf));
>>>
>>>        /* init cpu signals */
>>> -    sigset_t set;
>>>        struct sigaction sigact;
>>>
>>>        memset(&sigact, 0, sizeof(sigact));
>>>        sigact.sa_handler = dummy_signal;
>>>        sigaction(SIG_IPI, &sigact, NULL);
>>>
>>> -    pthread_sigmask(SIG_BLOCK, NULL, &set);
>>> -    sigdelset(&set, SIG_IPI);
>>> -    pthread_sigmask(SIG_SETMASK, &set, NULL);
>>> +    pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->sigmask);
>>> +    sigdelset(&cpu->hvf->sigmask, SIG_IPI);
>>> +    pthread_sigmask(SIG_SETMASK, &cpu->hvf->sigmask, NULL);
>>> +
>>> +    pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->sigmask_ipi);
>>> +    sigaddset(&cpu->hvf->sigmask_ipi, SIG_IPI);
>> There's no reason to unblock SIG_IPI while not in pselect and it can
>> easily lead to missed wakeups. The whole point of pselect is so that
>> you can guarantee that only one part of your program sees signals
>> without a possibility of them being missed.
>
>
> Hm, I think I start to agree with you here :). We can probably just 
> leave SIG_IPI masked at all times and only unmask on pselect. The 
> worst thing that will happen is a premature wakeup if we did get an 
> IPI incoming while hvf->sleeping is set, but were either not running 
> pselect() yet and bailed out or already finished pselect() execution.


How about this one? Do you really think it's still racy?


Alex


diff --git a/accel/hvf/hvf-cpus.c b/accel/hvf/hvf-cpus.c
index 4360f64671..e10fca622d 100644
--- a/accel/hvf/hvf-cpus.c
+++ b/accel/hvf/hvf-cpus.c
@@ -337,16 +337,17 @@ static int hvf_init_vcpu(CPUState *cpu)
      cpu->hvf = g_malloc0(sizeof(*cpu->hvf));

      /* init cpu signals */
-    sigset_t set;
      struct sigaction sigact;

      memset(&sigact, 0, sizeof(sigact));
      sigact.sa_handler = dummy_signal;
      sigaction(SIG_IPI, &sigact, NULL);

-    pthread_sigmask(SIG_BLOCK, NULL, &set);
-    sigdelset(&set, SIG_IPI);
-    pthread_sigmask(SIG_SETMASK, &set, NULL);
+    /* Remember unmasked IPI mask for pselect(), leave masked normally */
+    pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->sigmask_ipi);
+    sigaddset(&cpu->hvf->sigmask_ipi, SIG_IPI);
+    pthread_sigmask(SIG_SETMASK, &cpu->hvf->sigmask_ipi, NULL);
+    sigdelset(&cpu->hvf->sigmask_ipi, SIG_IPI);

  #ifdef __aarch64__
      r = hv_vcpu_create(&cpu->hvf->fd, (hv_vcpu_exit_t **)&cpu->hvf->exit, NULL);
diff --git a/include/sysemu/hvf_int.h b/include/sysemu/hvf_int.h
index c56baa3ae8..8d7d4a6226 100644
--- a/include/sysemu/hvf_int.h
+++ b/include/sysemu/hvf_int.h
@@ -62,8 +62,8 @@ extern HVFState *hvf_state;
  struct hvf_vcpu_state {
      uint64_t fd;
      void *exit;
-    struct timespec ts;
      bool sleeping;
+    sigset_t sigmask_ipi;
  };

  void assert_hvf_ok(hv_return_t ret);
diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c
index 0c01a03725..a255a1a7d3 100644
--- a/target/arm/hvf/hvf.c
+++ b/target/arm/hvf/hvf.c
@@ -320,14 +320,8 @@ int hvf_arch_init_vcpu(CPUState *cpu)

  void hvf_kick_vcpu_thread(CPUState *cpu)
  {
-    if (cpu->hvf->sleeping) {
-        /*
-         * When sleeping, make sure we always send signals. Also, clear the
-         * timespec, so that an IPI that arrives between setting hvf->sleeping
-         * and the nanosleep syscall still aborts the sleep.
-         */
-        cpu->thread_kicked = false;
-        cpu->hvf->ts = (struct timespec){ };
+    if (qatomic_read(&cpu->hvf->sleeping)) {
+        /* When sleeping, send a signal to get out of pselect */
          cpus_kick_thread(cpu);
      } else {
          hv_vcpus_exit(&cpu->hvf->fd, 1);
@@ -354,6 +348,7 @@ int hvf_vcpu_exec(CPUState *cpu)
      ARMCPU *arm_cpu = ARM_CPU(cpu);
      CPUARMState *env = &arm_cpu->env;
      hv_vcpu_exit_t *hvf_exit = cpu->hvf->exit;
+    const uint32_t irq_mask = CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ;
      hv_return_t r;
      int ret = 0;

@@ -491,8 +486,8 @@ int hvf_vcpu_exec(CPUState *cpu)
              break;
          }
          case EC_WFX_TRAP:
-            if (!(syndrome & WFX_IS_WFE) && !(cpu->interrupt_request &
-                (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) {
+            if (!(syndrome & WFX_IS_WFE) &&
+                !(cpu->interrupt_request & irq_mask)) {
                  uint64_t cval, ctl, val, diff, now;

                  /* Set up a local timer for vtimer if necessary ... */
@@ -515,9 +510,7 @@ int hvf_vcpu_exec(CPUState *cpu)

                  if (diff < INT64_MAX) {
                      uint64_t ns = diff * gt_cntfrq_period_ns(arm_cpu);
-                    struct timespec *ts = &cpu->hvf->ts;
-
-                    *ts = (struct timespec){
+                    struct timespec ts = {
                          .tv_sec = ns / NANOSECONDS_PER_SECOND,
                          .tv_nsec = ns % NANOSECONDS_PER_SECOND,
                      };
@@ -526,27 +519,27 @@ int hvf_vcpu_exec(CPUState *cpu)
                       * Waking up easily takes 1ms, don't go to sleep for smaller
                       * time periods than 2ms.
                       */
-                    if (!ts->tv_sec && (ts->tv_nsec < (SCALE_MS * 2))) {
+                    if (!ts.tv_sec && (ts.tv_nsec < (SCALE_MS * 2))) {
                          advance_pc = true;
                          break;
                      }

+                    cpu->thread_kicked = false;
+
                      /* Set cpu->hvf->sleeping so that we get a SIG_IPI signal. */
-                    cpu->hvf->sleeping = true;
-                    smp_mb();
+                    qatomic_set(&cpu->hvf->sleeping, true);

-                    /* Bail out if we received an IRQ meanwhile */
-                    if (cpu->thread_kicked || (cpu->interrupt_request &
-                        (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) {
-                        cpu->hvf->sleeping = false;
+                    /* Bail out if we received a kick meanwhile */
+                    if (qatomic_read(&cpu->interrupt_request) & irq_mask) {
+                        qatomic_set(&cpu->hvf->sleeping, false);
                          break;
                      }

-                    /* nanosleep returns on signal, so we wake up on kick. */
-                    nanosleep(ts, NULL);
+                    /* pselect returns on kick signal and consumes it */
+                    pselect(0, 0, 0, 0, &ts, &cpu->hvf->sigmask_ipi);

                      /* Out of sleep - either naturally or because of a kick */
-                    cpu->hvf->sleeping = false;
+                    qatomic_set(&cpu->hvf->sleeping, false);
                  }

                  advance_pc = true;


Re: [PATCH] arm/hvf: Optimize and simplify WFI handling
Posted by Peter Collingbourne 3 years, 5 months ago
On Tue, Dec 1, 2020 at 2:09 PM Alexander Graf <agraf@csgraf.de> wrote:
>
>
> On 01.12.20 21:03, Peter Collingbourne wrote:
> > On Tue, Dec 1, 2020 at 8:26 AM Alexander Graf <agraf@csgraf.de> wrote:
> >>
> >> On 01.12.20 09:21, Peter Collingbourne wrote:
> >>> Sleep on WFx until the VTIMER is due but allow ourselves to be woken
> >>> up on IPI.
> >>>
> >>> Signed-off-by: Peter Collingbourne <pcc@google.com>
> >>> ---
> >>> Alexander Graf wrote:
> >>>> I would love to take a patch from you here :). I'll still be stuck for a
> >>>> while with the sysreg sync rework that Peter asked for before I can look
> >>>> at WFI again.
> >>> Okay, here's a patch :) It's a relatively straightforward adaptation
> >>> of what we have in our fork, which can now boot Android to GUI while
> >>> remaining at around 4% CPU when idle.
> >>>
> >>> I'm not set up to boot a full Linux distribution at the moment so I
> >>> tested it on upstream QEMU by running a recent mainline Linux kernel
> >>> with a rootfs containing an init program that just does sleep(5)
> >>> and verified that the qemu process remains at low CPU usage during
> >>> the sleep. This was on top of your v2 plus the last patch of your v1
> >>> since it doesn't look like you have a replacement for that logic yet.
> >>
> >> How about something like this instead?
> >>
> >>
> >> Alex
> >>
> >>
> >> diff --git a/accel/hvf/hvf-cpus.c b/accel/hvf/hvf-cpus.c
> >> index 4360f64671..50384013ea 100644
> >> --- a/accel/hvf/hvf-cpus.c
> >> +++ b/accel/hvf/hvf-cpus.c
> >> @@ -337,16 +337,18 @@ static int hvf_init_vcpu(CPUState *cpu)
> >>        cpu->hvf = g_malloc0(sizeof(*cpu->hvf));
> >>
> >>        /* init cpu signals */
> >> -    sigset_t set;
> >>        struct sigaction sigact;
> >>
> >>        memset(&sigact, 0, sizeof(sigact));
> >>        sigact.sa_handler = dummy_signal;
> >>        sigaction(SIG_IPI, &sigact, NULL);
> >>
> >> -    pthread_sigmask(SIG_BLOCK, NULL, &set);
> >> -    sigdelset(&set, SIG_IPI);
> >> -    pthread_sigmask(SIG_SETMASK, &set, NULL);
> >> +    pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->sigmask);
> >> +    sigdelset(&cpu->hvf->sigmask, SIG_IPI);
> >> +    pthread_sigmask(SIG_SETMASK, &cpu->hvf->sigmask, NULL);
> >> +
> >> +    pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->sigmask_ipi);
> >> +    sigaddset(&cpu->hvf->sigmask_ipi, SIG_IPI);
> > There's no reason to unblock SIG_IPI while not in pselect and it can
> > easily lead to missed wakeups. The whole point of pselect is so that
> > you can guarantee that only one part of your program sees signals
> > without a possibility of them being missed.
>
>
> Hm, I think I start to agree with you here :). We can probably just
> leave SIG_IPI masked at all times and only unmask on pselect. The worst
> thing that will happen is a premature wakeup if we did get an IPI
> incoming while hvf->sleeping is set, but were either not running
> pselect() yet and bailed out or already finished pselect() execution.

Ack.

> >
> >>    #ifdef __aarch64__
> >>        r = hv_vcpu_create(&cpu->hvf->fd, (hv_vcpu_exit_t
> >> **)&cpu->hvf->exit, NULL);
> >> diff --git a/include/sysemu/hvf_int.h b/include/sysemu/hvf_int.h
> >> index c56baa3ae8..6e237f2db0 100644
> >> --- a/include/sysemu/hvf_int.h
> >> +++ b/include/sysemu/hvf_int.h
> >> @@ -62,8 +62,9 @@ extern HVFState *hvf_state;
> >>    struct hvf_vcpu_state {
> >>        uint64_t fd;
> >>        void *exit;
> >> -    struct timespec ts;
> >>        bool sleeping;
> >> +    sigset_t sigmask;
> >> +    sigset_t sigmask_ipi;
> >>    };
> >>
> >>    void assert_hvf_ok(hv_return_t ret);
> >> diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c
> >> index 0c01a03725..350b845e6e 100644
> >> --- a/target/arm/hvf/hvf.c
> >> +++ b/target/arm/hvf/hvf.c
> >> @@ -320,20 +320,24 @@ int hvf_arch_init_vcpu(CPUState *cpu)
> >>
> >>    void hvf_kick_vcpu_thread(CPUState *cpu)
> >>    {
> >> -    if (cpu->hvf->sleeping) {
> >> -        /*
> >> -         * When sleeping, make sure we always send signals. Also, clear the
> >> -         * timespec, so that an IPI that arrives between setting
> >> hvf->sleeping
> >> -         * and the nanosleep syscall still aborts the sleep.
> >> -         */
> >> -        cpu->thread_kicked = false;
> >> -        cpu->hvf->ts = (struct timespec){ };
> >> +    if (qatomic_read(&cpu->hvf->sleeping)) {
> >> +        /* When sleeping, send a signal to get out of pselect */
> >>            cpus_kick_thread(cpu);
> >>        } else {
> >>            hv_vcpus_exit(&cpu->hvf->fd, 1);
> >>        }
> >>    }
> >>
> >> +static void hvf_block_sig_ipi(CPUState *cpu)
> >> +{
> >> +    pthread_sigmask(SIG_SETMASK, &cpu->hvf->sigmask_ipi, NULL);
> >> +}
> >> +
> >> +static void hvf_unblock_sig_ipi(CPUState *cpu)
> >> +{
> >> +    pthread_sigmask(SIG_SETMASK, &cpu->hvf->sigmask, NULL);
> >> +}
> >> +
> >>    static int hvf_inject_interrupts(CPUState *cpu)
> >>    {
> >>        if (cpu->interrupt_request & CPU_INTERRUPT_FIQ) {
> >> @@ -354,6 +358,7 @@ int hvf_vcpu_exec(CPUState *cpu)
> >>        ARMCPU *arm_cpu = ARM_CPU(cpu);
> >>        CPUARMState *env = &arm_cpu->env;
> >>        hv_vcpu_exit_t *hvf_exit = cpu->hvf->exit;
> >> +    const uint32_t irq_mask = CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ;
> >>        hv_return_t r;
> >>        int ret = 0;
> >>
> >> @@ -491,8 +496,8 @@ int hvf_vcpu_exec(CPUState *cpu)
> >>                break;
> >>            }
> >>            case EC_WFX_TRAP:
> >> -            if (!(syndrome & WFX_IS_WFE) && !(cpu->interrupt_request &
> >> -                (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) {
> >> +            if (!(syndrome & WFX_IS_WFE) &&
> >> +                !(cpu->interrupt_request & irq_mask)) {
> >>                    uint64_t cval, ctl, val, diff, now;
> > I don't think the access to cpu->interrupt_request is safe because it
> > is done while not under the iothread lock. That's why to avoid these
> > types of issues I would prefer to hold the lock almost all of the
> > time.
>
>
> In this branch, that's not a problem yet. On stale values, we either
> don't sleep (which is ok), or we go into the sleep path, and reevaluate
> cpu->interrupt_request atomically again after setting hvf->sleeping.

Okay, this may be a "benign race" (and it may be helped a little by
the M1's sequential consistency extension), but this is the sort of
thing that I'd prefer not to rely on. At least it should be an atomic
read.
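
For concreteness, the kind of change being suggested (a hypothetical helper,
not code from either patch):

/*
 * Sketch only: check for a pending IRQ/FIQ with an atomic read, so the
 * access stays well-defined even while the iothread lock is not held.
 */
static bool wfx_irq_pending(CPUState *cpu)
{
    const uint32_t irq_mask = CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ;

    return qatomic_read(&cpu->interrupt_request) & irq_mask;
}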

> >
> >>                    /* Set up a local timer for vtimer if necessary ... */
> >> @@ -515,9 +520,7 @@ int hvf_vcpu_exec(CPUState *cpu)
> >>
> >>                    if (diff < INT64_MAX) {
> >>                        uint64_t ns = diff * gt_cntfrq_period_ns(arm_cpu);
> >> -                    struct timespec *ts = &cpu->hvf->ts;
> >> -
> >> -                    *ts = (struct timespec){
> >> +                    struct timespec ts = {
> >>                            .tv_sec = ns / NANOSECONDS_PER_SECOND,
> >>                            .tv_nsec = ns % NANOSECONDS_PER_SECOND,
> >>                        };
> >> @@ -526,27 +529,31 @@ int hvf_vcpu_exec(CPUState *cpu)
> >>                         * Waking up easily takes 1ms, don't go to sleep
> >> for smaller
> >>                         * time periods than 2ms.
> >>                         */
> >> -                    if (!ts->tv_sec && (ts->tv_nsec < (SCALE_MS * 2))) {
> >> +                    if (!ts.tv_sec && (ts.tv_nsec < (SCALE_MS * 2))) {
> >>                            advance_pc = true;
> >>                            break;
> >>                        }
> >>
> >> +                    /* block SIG_IPI for the sleep */
> >> +                    hvf_block_sig_ipi(cpu);
> >> +                    cpu->thread_kicked = false;
> >> +
> >>                        /* Set cpu->hvf->sleeping so that we get a SIG_IPI
> >> signal. */
> >> -                    cpu->hvf->sleeping = true;
> >> -                    smp_mb();
> >> +                    qatomic_set(&cpu->hvf->sleeping, true);
> > This doesn't protect against races because another thread could call
> > hvf_kick_vcpu_thread() at any time between when we return from
> > hv_vcpu_run() and when we set sleeping = true and we would miss the
> > wakeup (due to hvf_kick_vcpu_thread() seeing sleeping = false and
> > calling hv_vcpus_exit() instead of pthread_kill()). I don't think it
> > can be fixed by setting sleeping to true earlier either because no
> > matter how early you move it, there will always be a window where we
> > are going to pselect() but sleeping is false, resulting in a missed
> > wakeup.
>
>
> I don't follow. If anyone was sending us an IPI, it's because they want
> to notify us about an update to cpu->interrupt_request, right? In that
> case, the atomic read of that field below will catch it and bail out of
> the sleep sequence.

I think there are other possible IPI reasons, e.g. setting halted to 1,
or I/O events. Now we could check for halted below, and maybe some of
the others, but the code will be subtle and it seems like a game of
whack-a-mole to get them all. This is an example of what I was talking
about when I said that an approach that relies on the sleeping field
will be difficult to get right. I would strongly prefer to start with
a simple approach and maybe we can consider a more complicated one
later.

Peter

>
>
> >
> > Peter
> >
> >> -                    /* Bail out if we received an IRQ meanwhile */
> >> -                    if (cpu->thread_kicked || (cpu->interrupt_request &
> >> -                        (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) {
> >> -                        cpu->hvf->sleeping = false;
> >> +                    /* Bail out if we received a kick meanwhile */
> >> +                    if (qatomic_read(&cpu->interrupt_request) & irq_mask) {
> >> + qatomic_set(&cpu->hvf->sleeping, false);
>
>
> ^^^
>
>
> Alex
>
>
> >> +                        hvf_unblock_sig_ipi(cpu);
> >>                            break;
> >>                        }
> >>
> >> -                    /* nanosleep returns on signal, so we wake up on
> >> kick. */
> >> -                    nanosleep(ts, NULL);
> >> +                    /* pselect returns on kick signal and consumes it */
> >> +                    pselect(0, 0, 0, 0, &ts, &cpu->hvf->sigmask);
> >>
> >>                        /* Out of sleep - either naturally or because of a
> >> kick */
> >> -                    cpu->hvf->sleeping = false;
> >> +                    qatomic_set(&cpu->hvf->sleeping, false);
> >> +                    hvf_unblock_sig_ipi(cpu);
> >>                    }
> >>
> >>                    advance_pc = true;
> >>