[PATCH] ppc/spapr: Fix RTAS stopped state

Nicholas Piggin posted 1 patch 1 week, 6 days ago
target/ppc/cpu.h         | 11 +++++++++++
hw/ppc/pnv_core.c        |  6 +++++-
hw/ppc/spapr_cpu_core.c  |  6 ++++++
hw/ppc/spapr_rtas.c      |  5 ++++-
target/ppc/excp_helper.c |  4 ++++
5 files changed, 30 insertions(+), 2 deletions(-)
[PATCH] ppc/spapr: Fix RTAS stopped state
Posted by Nicholas Piggin 1 week, 6 days ago
This change takes the CPUPPCState 'quiesced' field added for powernv
hardware CPU core controls (used to stop and start cores), and extends
it to spapr to model the "RTAS stopped" state. This prevents the
schedulers attempting to run stopped CPUs unexpectedly, which can cause
hangs and possibly other unexpected behaviour.

The detail of the problematic situation is this:

A KVM spapr guest boots with all secondary CPUs defined to be in the
"RTAS stopped" state. In this state, the CPU is only responsive to the
start-cpu RTAS call. This behaviour is modeled in QEMU with the
start_powered_off feature, which sets ->halted on secondary CPUs at
boot. ->halted=true looks like an idle / sleep / power-save state which
typically is responsive to asynchronous interrupts, but spapr clears
wake-on-interrupt bits in the LPCR SPR. This more-or-less works.

Commit e8291ec16da8 ("target/ppc: fix timebase register reset state")
recently caused the decrementer to expire sooner at boot, causing a
decrementer exception on secondary CPUs in RTAS stopped state. This
was not a problem on TCG, but KVM limits how a guest can modify LPCR, in
particular it prevents the clearing of wake-on-interrupt bits, and so in
the course of CPU register synchronisation, the LPCR as set by spapr to
model the RTAS stopped state is overwritten with KVM's LPCR value, and
that then causes QEMU's interrupt code to notice the expired decrementer
exception, turn that into an interrupt, and set CPU_INTERRUPT_HARD.

That causes the CPU to be kicked, and the KVM vCPU thread to loop
calling kvm_cpu_exec(). kvm_cpu_exec() calls
kvm_arch_process_async_events(), which on ppc just returns ->halted.
This is still true, so it returns immediately with EXCP_HLT, and the
vCPU never goes to sleep because qemu_wait_io_event() sees
CPU_INTERRUPT_HARD is set. All this while the vCPU holds the bql.  This
causes the boot CPU to eventually lock up when it needs the bql.

So make 'quiesced' represent the "RTAS stopped" state, and have it
explicitly not respond to exceptions (interrupt conditions) rather than
rely on machine register state to model that state. This matches the
powernv quiesced state very well because it essentially turns off the
CPU core via a side-band control unit.

There are still issues with QEMU and KVM idea of LPCR diverging and that
is quite ugly and fragile that should be fixed. spapr should synchronize
its LPCR properly with KVM, and not try to use values that KVM does not
support.

Reported-by: Misbah Anjum N <misanjum@linux.ibm.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 target/ppc/cpu.h         | 11 +++++++++++
 hw/ppc/pnv_core.c        |  6 +++++-
 hw/ppc/spapr_cpu_core.c  |  6 ++++++
 hw/ppc/spapr_rtas.c      |  5 ++++-
 target/ppc/excp_helper.c |  4 ++++
 5 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index efab54a0683..3ee83517dcd 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -1356,6 +1356,17 @@ struct CPUArchState {
      * special way (such as routing some resume causes to 0x100, i.e. sreset).
      */
     bool resume_as_sreset;
+
+    /*
+     * On powernv, quiesced means the CPU has been stopped using PC direct
+     * control xscom registers.
+     *
+     * On spapr, quiesced means it is in the "RTAS stopped" state.
+     *
+     * The core halted/stopped variables aren't sufficient for this, because
+     * they can be changed with various side-band operations like qmp cont,
+     * powersave interrupts, etc.
+     */
     bool quiesced;
 #endif
 
diff --git a/hw/ppc/pnv_core.c b/hw/ppc/pnv_core.c
index 99d9644ee38..a33977da188 100644
--- a/hw/ppc/pnv_core.c
+++ b/hw/ppc/pnv_core.c
@@ -248,21 +248,25 @@ static void pnv_core_power10_xscom_write(void *opaque, hwaddr addr,
 
             if (val & PPC_BIT(7 + 8 * i)) { /* stop */
                 val &= ~PPC_BIT(7 + 8 * i);
-                cpu_pause(cs);
                 env->quiesced = true;
+                ppc_maybe_interrupt(env);
+                cpu_pause(cs);
             }
             if (val & PPC_BIT(6 + 8 * i)) { /* start */
                 val &= ~PPC_BIT(6 + 8 * i);
                 env->quiesced = false;
+                ppc_maybe_interrupt(env);
                 cpu_resume(cs);
             }
             if (val & PPC_BIT(4 + 8 * i)) { /* sreset */
                 val &= ~PPC_BIT(4 + 8 * i);
                 env->quiesced = false;
+                ppc_maybe_interrupt(env);
                 pnv_cpu_do_nmi_resume(cs);
             }
             if (val & PPC_BIT(3 + 8 * i)) { /* clear maint */
                 env->quiesced = false;
+                ppc_maybe_interrupt(env);
                 /*
                  * Hardware has very particular cases for where clear maint
                  * must be used and where start must be used to resume a
diff --git a/hw/ppc/spapr_cpu_core.c b/hw/ppc/spapr_cpu_core.c
index 0671d9e44b4..faf9170ba6b 100644
--- a/hw/ppc/spapr_cpu_core.c
+++ b/hw/ppc/spapr_cpu_core.c
@@ -37,6 +37,9 @@ static void spapr_reset_vcpu(PowerPCCPU *cpu)
 
     cpu_reset(cs);
 
+    env->quiesced = true; /* set "RTAS stopped" state. */
+    ppc_maybe_interrupt(env);
+
     /*
      * "PowerPC Processor binding to IEEE 1275" defines the initial MSR state
      * as 32bit (MSR_SF=0) with MSR_ME=1 and MSR_FP=1 in "8.2.1. Initial
@@ -98,6 +101,9 @@ void spapr_cpu_set_entry_state(PowerPCCPU *cpu, target_ulong nip,
     CPU(cpu)->halted = 0;
     /* Enable Power-saving mode Exit Cause exceptions */
     ppc_store_lpcr(cpu, env->spr[SPR_LPCR] | pcc->lpcr_pm);
+
+    env->quiesced = false; /* clear "RTAS stopped" state. */
+    ppc_maybe_interrupt(env);
 }
 
 /*
diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c
index 503d441b48e..78309dbb09d 100644
--- a/hw/ppc/spapr_rtas.c
+++ b/hw/ppc/spapr_rtas.c
@@ -110,7 +110,8 @@ static void rtas_query_cpu_stopped_state(PowerPCCPU *cpu_,
     id = rtas_ld(args, 0);
     cpu = spapr_find_cpu(id);
     if (cpu != NULL) {
-        if (CPU(cpu)->halted) {
+        CPUPPCState *env = &cpu->env;
+        if (env->quiesced) {
             rtas_st(rets, 1, 0);
         } else {
             rtas_st(rets, 1, 2);
@@ -215,6 +216,8 @@ static void rtas_stop_self(PowerPCCPU *cpu, SpaprMachineState *spapr,
      * For the same reason, set PSSCR_EC.
      */
     env->spr[SPR_PSSCR] |= PSSCR_EC;
+    env->quiesced = true; /* set "RTAS stopped" state. */
+    ppc_maybe_interrupt(env);
     cs->halted = 1;
     ppc_store_lpcr(cpu, env->spr[SPR_LPCR] & ~pcc->lpcr_pm);
     kvmppc_set_reg_ppc_online(cpu, 0);
diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
index 44e19aacd8d..c941c89806e 100644
--- a/target/ppc/excp_helper.c
+++ b/target/ppc/excp_helper.c
@@ -1951,6 +1951,10 @@ static int ppc_next_unmasked_interrupt(CPUPPCState *env)
     target_ulong lpcr = env->spr[SPR_LPCR];
     bool async_deliver;
 
+    if (unlikely(env->quiesced)) {
+        return 0;
+    }
+
 #ifdef TARGET_PPC64
     switch (env->excp_model) {
     case POWERPC_EXCP_POWER7:
-- 
2.47.1
Re: [PATCH] ppc/spapr: Fix RTAS stopped state
Posted by Misbah Anjum N 1 week, 6 days ago
Thanks for the quick fix.
I tested the patch and I am successfully able to boot the KVM guest.


Console Logs:
# b4 am 20250320043443.88829-1-npiggin@gmail.com --outdir patches
Analyzing 1 messages in the thread
Analyzing 0 code-review messages
Checking attestation on all messages, may take a moment...
---
   ✓ [PATCH] ppc/spapr: Fix RTAS stopped state
   ---
   ✓ Signed: DKIM/gmail.com
---
Total patches: 1
---
  Link: 
https://lore.kernel.org/qemu-devel/20250320043443.88829-1-npiggin@gmail.com

# git am patches/*.mbx
Applying: ppc/spapr: Fix RTAS stopped state

# /usr/bin/qemu-system-ppc64   -name avocado-vt-vm1   -machine 
pseries,accel=kvm   -m 32768   -smp 32,sockets=1,cores=32,threads=1   
-nographic   -device virtio-scsi-pci,id=scsi   -drive 
file=/home/kvmci/tests/data/avocado-vt/images/rhel8.0devel-ppc64le.qcow2,if=none,id=drive0,format=qcow2 
   -device scsi-hd,drive=drive0,bus=scsi.0   -netdev 
bridge,id=net0,br=virbr0   -device virtio-net-pci,netdev=net0   -serial 
pty   -device virtio-balloon-pci   -cpu host
QEMU 9.2.90 monitor - type 'help' for more information
char device redirected to /dev/pts/1 (label serial0)
(qemu)

(In another ssh session)
# screen /dev/pts/1
Preparing to boot Linux version 6.10.4-200.fc40.ppc64le 
(mockbuild@c23cc4e677614c34bb22d54eeea4dc1f) (gcc (GCC) 14.2.1 20240801 
(Red Hat 14.2.1-1), GNU ld version 2.41-37.fc40) #1 SMP Sun Aug 11 
15:20:17 UTC 2024
Detected machine type: 0000000000000101
command line: 
BOOT_IMAGE=(ieee1275/disk,msdos2)/vmlinuz-6.10.4-200.fc40.ppc64le 
root=/dev/mapper/fedora-root ro rd.lvm.lv=fedora/root crashkernel=1024M
Max number of cores passed to firmware: 2048 (NR_CPUS = 2048)
Calling ibm,client-architecture-support... done
memory layout at init:
   memory_limit : 0000000000000000 (16 MB aligned)
   alloc_bottom : 0000000008200000
   alloc_top    : 0000000030000000
   alloc_top_hi : 0000000800000000
   rmo_top      : 0000000030000000
   ram_top      : 0000000800000000
found display   : /pci@800000020000000/vga@0, opening... done
instantiating rtas at 0x000000002fff0000... done
prom_hold_cpus: skipped
copying OF device tree...
Building dt strings...
Building dt structure...
Device tree strings 0x0000000008210000 -> 0x0000000008210c0d
Device tree struct  0x0000000008220000 -> 0x0000000008230000
Quiescing Open Firmware ...
Booting Linux via __start() @ 0x0000000000440000 ...
[    0.000000] random: crng init done
[    0.000000] Reserving 1024MB of memory at 512MB for crashkernel 
(System RAM: 32768MB)
[    0.000000] radix-mmu: Page sizes from device-tree:
[    0.000000] radix-mmu: Page size shift = 12 AP=0x0
[    0.000000] radix-mmu: Page size shift = 16 AP=0x5
[    0.000000] radix-mmu: Page size shift = 21 AP=0x1
[    0.000000] radix-mmu: Page size shift = 30 AP=0x2
[    0.000000] Activating Kernel Userspace Access Prevention
[    0.000000] Activating Kernel Userspace Execution Prevention
[    0.000000] radix-mmu: Mapped 0x0000000000000000-0x0000000003950000 
with 64.0 KiB pages (exec)
[    0.000000] radix-mmu: Mapped 0x0000000003950000-0x0000000800000000 
with 64.0 KiB pages
[    0.000000] lpar: Using radix MMU under hypervisor
[    0.000000] Linux version 6.10.4-200.fc40.ppc64le 
(mockbuild@c23cc4e677614c34bb22d54eeea4dc1f) (gcc (GCC) 14.2.1 20240801 
(Red Hat 14.2.1-1), GNU ld version 2.41-37.fc40) #1 SMP Sun Aug 11 
15:20:17 UTC 2024
...
...
** Boot Successful


Tested-by: Misbah Anjum N <misanjum@linux.ibm.com>

Thanks,
Misbah Anjum N

On 2025-03-20 10:04, Nicholas Piggin wrote:
> This change takes the CPUPPCState 'quiesced' field added for powernv
> hardware CPU core controls (used to stop and start cores), and extends
> it to spapr to model the "RTAS stopped" state. This prevents the
> schedulers attempting to run stopped CPUs unexpectedly, which can cause
> hangs and possibly other unexpected behaviour.
> 
> The detail of the problematic situation is this:
> 
> A KVM spapr guest boots with all secondary CPUs defined to be in the
> "RTAS stopped" state. In this state, the CPU is only responsive to the
> start-cpu RTAS call. This behaviour is modeled in QEMU with the
> start_powered_off feature, which sets ->halted on secondary CPUs at
> boot. ->halted=true looks like an idle / sleep / power-save state which
> typically is responsive to asynchronous interrupts, but spapr clears
> wake-on-interrupt bits in the LPCR SPR. This more-or-less works.
> 
> Commit e8291ec16da8 ("target/ppc: fix timebase register reset state")
> recently caused the decrementer to expire sooner at boot, causing a
> decrementer exception on secondary CPUs in RTAS stopped state. This
> was not a problem on TCG, but KVM limits how a guest can modify LPCR, 
> in
> particular it prevents the clearing of wake-on-interrupt bits, and so 
> in
> the course of CPU register synchronisation, the LPCR as set by spapr to
> model the RTAS stopped state is overwritten with KVM's LPCR value, and
> that then causes QEMU's interrupt code to notice the expired 
> decrementer
> exception, turn that into an interrupt, and set CPU_INTERRUPT_HARD.
> 
> That causes the CPU to be kicked, and the KVM vCPU thread to loop
> calling kvm_cpu_exec(). kvm_cpu_exec() calls
> kvm_arch_process_async_events(), which on ppc just returns ->halted.
> This is still true, so it returns immediately with EXCP_HLT, and the
> vCPU never goes to sleep because qemu_wait_io_event() sees
> CPU_INTERRUPT_HARD is set. All this while the vCPU holds the bql.  This
> causes the boot CPU to eventually lock up when it needs the bql.
> 
> So make 'quiesced' represent the "RTAS stopped" state, and have it
> explicitly not respond to exceptions (interrupt conditions) rather than
> rely on machine register state to model that state. This matches the
> powernv quiesced state very well because it essentially turns off the
> CPU core via a side-band control unit.
> 
> There are still issues with QEMU and KVM idea of LPCR diverging and 
> that
> is quite ugly and fragile that should be fixed. spapr should 
> synchronize
> its LPCR properly with KVM, and not try to use values that KVM does not
> support.
> 
> Reported-by: Misbah Anjum N <misanjum@linux.ibm.com>
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
>  target/ppc/cpu.h         | 11 +++++++++++
>  hw/ppc/pnv_core.c        |  6 +++++-
>  hw/ppc/spapr_cpu_core.c  |  6 ++++++
>  hw/ppc/spapr_rtas.c      |  5 ++++-
>  target/ppc/excp_helper.c |  4 ++++
>  5 files changed, 30 insertions(+), 2 deletions(-)
> 
> diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
> index efab54a0683..3ee83517dcd 100644
> --- a/target/ppc/cpu.h
> +++ b/target/ppc/cpu.h
> @@ -1356,6 +1356,17 @@ struct CPUArchState {
>       * special way (such as routing some resume causes to 0x100, i.e. 
> sreset).
>       */
>      bool resume_as_sreset;
> +
> +    /*
> +     * On powernv, quiesced means the CPU has been stopped using PC 
> direct
> +     * control xscom registers.
> +     *
> +     * On spapr, quiesced means it is in the "RTAS stopped" state.
> +     *
> +     * The core halted/stopped variables aren't sufficient for this, 
> because
> +     * they can be changed with various side-band operations like qmp 
> cont,
> +     * powersave interrupts, etc.
> +     */
>      bool quiesced;
>  #endif
> 
> diff --git a/hw/ppc/pnv_core.c b/hw/ppc/pnv_core.c
> index 99d9644ee38..a33977da188 100644
> --- a/hw/ppc/pnv_core.c
> +++ b/hw/ppc/pnv_core.c
> @@ -248,21 +248,25 @@ static void pnv_core_power10_xscom_write(void
> *opaque, hwaddr addr,
> 
>              if (val & PPC_BIT(7 + 8 * i)) { /* stop */
>                  val &= ~PPC_BIT(7 + 8 * i);
> -                cpu_pause(cs);
>                  env->quiesced = true;
> +                ppc_maybe_interrupt(env);
> +                cpu_pause(cs);
>              }
>              if (val & PPC_BIT(6 + 8 * i)) { /* start */
>                  val &= ~PPC_BIT(6 + 8 * i);
>                  env->quiesced = false;
> +                ppc_maybe_interrupt(env);
>                  cpu_resume(cs);
>              }
>              if (val & PPC_BIT(4 + 8 * i)) { /* sreset */
>                  val &= ~PPC_BIT(4 + 8 * i);
>                  env->quiesced = false;
> +                ppc_maybe_interrupt(env);
>                  pnv_cpu_do_nmi_resume(cs);
>              }
>              if (val & PPC_BIT(3 + 8 * i)) { /* clear maint */
>                  env->quiesced = false;
> +                ppc_maybe_interrupt(env);
>                  /*
>                   * Hardware has very particular cases for where clear 
> maint
>                   * must be used and where start must be used to resume 
> a
> diff --git a/hw/ppc/spapr_cpu_core.c b/hw/ppc/spapr_cpu_core.c
> index 0671d9e44b4..faf9170ba6b 100644
> --- a/hw/ppc/spapr_cpu_core.c
> +++ b/hw/ppc/spapr_cpu_core.c
> @@ -37,6 +37,9 @@ static void spapr_reset_vcpu(PowerPCCPU *cpu)
> 
>      cpu_reset(cs);
> 
> +    env->quiesced = true; /* set "RTAS stopped" state. */
> +    ppc_maybe_interrupt(env);
> +
>      /*
>       * "PowerPC Processor binding to IEEE 1275" defines the initial 
> MSR state
>       * as 32bit (MSR_SF=0) with MSR_ME=1 and MSR_FP=1 in "8.2.1. 
> Initial
> @@ -98,6 +101,9 @@ void spapr_cpu_set_entry_state(PowerPCCPU *cpu,
> target_ulong nip,
>      CPU(cpu)->halted = 0;
>      /* Enable Power-saving mode Exit Cause exceptions */
>      ppc_store_lpcr(cpu, env->spr[SPR_LPCR] | pcc->lpcr_pm);
> +
> +    env->quiesced = false; /* clear "RTAS stopped" state. */
> +    ppc_maybe_interrupt(env);
>  }
> 
>  /*
> diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c
> index 503d441b48e..78309dbb09d 100644
> --- a/hw/ppc/spapr_rtas.c
> +++ b/hw/ppc/spapr_rtas.c
> @@ -110,7 +110,8 @@ static void rtas_query_cpu_stopped_state(PowerPCCPU 
> *cpu_,
>      id = rtas_ld(args, 0);
>      cpu = spapr_find_cpu(id);
>      if (cpu != NULL) {
> -        if (CPU(cpu)->halted) {
> +        CPUPPCState *env = &cpu->env;
> +        if (env->quiesced) {
>              rtas_st(rets, 1, 0);
>          } else {
>              rtas_st(rets, 1, 2);
> @@ -215,6 +216,8 @@ static void rtas_stop_self(PowerPCCPU *cpu,
> SpaprMachineState *spapr,
>       * For the same reason, set PSSCR_EC.
>       */
>      env->spr[SPR_PSSCR] |= PSSCR_EC;
> +    env->quiesced = true; /* set "RTAS stopped" state. */
> +    ppc_maybe_interrupt(env);
>      cs->halted = 1;
>      ppc_store_lpcr(cpu, env->spr[SPR_LPCR] & ~pcc->lpcr_pm);
>      kvmppc_set_reg_ppc_online(cpu, 0);
> diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
> index 44e19aacd8d..c941c89806e 100644
> --- a/target/ppc/excp_helper.c
> +++ b/target/ppc/excp_helper.c
> @@ -1951,6 +1951,10 @@ static int 
> ppc_next_unmasked_interrupt(CPUPPCState *env)
>      target_ulong lpcr = env->spr[SPR_LPCR];
>      bool async_deliver;
> 
> +    if (unlikely(env->quiesced)) {
> +        return 0;
> +    }
> +
>  #ifdef TARGET_PPC64
>      switch (env->excp_model) {
>      case POWERPC_EXCP_POWER7: