[Qemu-devel] [PATCH] ppc: Improve SMT experience with TCG accel

Jose Ricardo Ziviani posted 1 patch 4 years, 8 months ago
Test asan passed
Test docker-mingw@fedora passed
Test checkpatch passed
Test docker-clang@ubuntu failed
Test s390x passed
Test FreeBSD passed
Patches applied successfully (tree, apply log)
git fetch https://github.com/patchew-project/qemu tags/patchew/20190716040253.23490-1-joserz@linux.ibm.com
Maintainers: David Gibson <david@gibson.dropbear.id.au>
hw/ppc/spapr.c           |  5 -----
target/ppc/excp_helper.c | 24 ++++++++++++++++++++++++
target/ppc/helper.h      |  1 +
target/ppc/translate.c   | 11 +++++++++++
4 files changed, 36 insertions(+), 5 deletions(-)
[Qemu-devel] [PATCH] ppc: Improve SMT experience with TCG accel
Posted by Jose Ricardo Ziviani 4 years, 8 months ago
It's not possible to specify more than one thread per core for a pseries
guest when running QEMU/TCG. Today, users can have setups like:

... -accel tcg,thread=multi -smp 8,threads=1,cores=8 ...
or
... -accel tcg,thread=multi -smp 8,sockets=2,cores=4,threads=1 ...

However, the following is not possible:

... -accel tcg,thread=multi -smp 16,threads=4,cores=2,sockets=2 ...
qemu-system-ppc64: TCG cannot support more than 1 thread/core on a pseries machine

The reason is how SMT has been implemented since POWER8. This patch
implements a very basic simulation of the msgsndp instruction, using the
external interrupt instead of doorbells. The result is a better user
experience, allowing users to play with SMT modes. However, it is not
related to MTTCG threads in any way.

Results:
... -accel tcg,thread=multi -smp 16,threads=4,cores=2,sockets=2 ...

root@ubuntu:~# ppc64_cpu --smt
SMT=4
root@ubuntu:~# ppc64_cpu --info
Core   0:    0*    1*    2*    3*
Core   1:    4*    5*    6*    7*
Core   2:    8*    9*   10*   11*
Core   3:   12*   13*   14*   15*
root@ubuntu:~# ppc64_cpu --smt=2
root@ubuntu:~# ppc64_cpu --info
Core   0:    0*    1*    2     3
Core   1:    4*    5*    6     7
Core   2:    8*    9*   10    11
Core   3:   12*   13*   14    15
root@ubuntu:~# ppc64_cpu --smt=off
root@ubuntu:~# ppc64_cpu --info
Core   0:    0*    1     2     3
Core   1:    4*    5     6     7
Core   2:    8*    9    10    11
Core   3:   12*   13    14    15

root@ubuntu:~# ppc64_cpu --smt
SMT is off
root@ubuntu:~# lscpu
Architecture:         ppc64le
Byte Order:           Little Endian
CPU(s):               16
On-line CPU(s) list:  0,4,8,12
Off-line CPU(s) list: 1-3,5-7,9-11,13-15
Thread(s) per core:   1
Core(s) per socket:   2
Socket(s):            2
NUMA node(s):         1
Model:                2.0 (pvr 004e 1200)
Model name:           POWER9 (architected), altivec supported
Hypervisor vendor:    KVM
Virtualization type:  para
L1d cache:            32K
L1i cache:            32K
NUMA node0 CPU(s):    0,4,8,12

root@ubuntu:~# ppc64_cpu --smt=4
root@ubuntu:~# lscpu
Architecture:        ppc64le
Byte Order:          Little Endian
CPU(s):              16
On-line CPU(s) list: 0-15
Thread(s) per core:  4
Core(s) per socket:  2
Socket(s):           2
NUMA node(s):        1
Model:               2.0 (pvr 004e 1200)
Model name:          POWER9 (architected), altivec supported
Hypervisor vendor:   KVM
Virtualization type: para
L1d cache:           32K
L1i cache:           32K
NUMA node0 CPU(s):   0-15

Note: it's also possible to simulate SMT in TCG single threaded mode.

Signed-off-by: Jose Ricardo Ziviani <joserz@linux.ibm.com>
---
 hw/ppc/spapr.c           |  5 -----
 target/ppc/excp_helper.c | 24 ++++++++++++++++++++++++
 target/ppc/helper.h      |  1 +
 target/ppc/translate.c   | 11 +++++++++++
 4 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 8783b43396..3a864dfc7d 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -2572,11 +2572,6 @@ static void spapr_set_vsmt_mode(SpaprMachineState *spapr, Error **errp)
     int ret;
     unsigned int smp_threads = ms->smp.threads;
 
-    if (!kvm_enabled() && (smp_threads > 1)) {
-        error_setg(&local_err, "TCG cannot support more than 1 thread/core "
-                     "on a pseries machine");
-        goto out;
-    }
     if (!is_power_of_2(smp_threads)) {
         error_setg(&local_err, "Cannot support %d threads/core on a pseries "
                      "machine because it must be a power of 2", smp_threads);
diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
index 50b004d00d..ac5d196641 100644
--- a/target/ppc/excp_helper.c
+++ b/target/ppc/excp_helper.c
@@ -1231,6 +1231,30 @@ static int book3s_dbell2irq(target_ulong rb)
     return msg == DBELL_TYPE_DBELL_SERVER ? PPC_INTERRUPT_HDOORBELL : -1;
 }
 
+void helper_msgsndp(target_ulong rb)
+{
+    CPUState *cs;
+    int irq = rb & DBELL_TYPE_MASK;
+    int thread_id = rb & 0x3f;
+
+    if (irq != DBELL_TYPE_DBELL_SERVER) {
+        return;
+    }
+
+    qemu_mutex_lock_iothread();
+    CPU_FOREACH(cs) {
+        PowerPCCPU *cpu = POWERPC_CPU(cs);
+
+        if (cpu->vcpu_id == thread_id) {
+            continue;
+        }
+
+        cpu->env.pending_interrupts |= 1 << PPC_INTERRUPT_EXT;
+        cpu_interrupt(cs, CPU_INTERRUPT_HARD);
+    }
+    qemu_mutex_unlock_iothread();
+}
+
 void helper_book3s_msgclr(CPUPPCState *env, target_ulong rb)
 {
     int irq = book3s_dbell2irq(rb);
diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 380c9b1e2a..eadd08324b 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -630,6 +630,7 @@ DEF_HELPER_FLAGS_3(store_sr, TCG_CALL_NO_RWG, void, env, tl, tl)
 
 DEF_HELPER_FLAGS_1(602_mfrom, TCG_CALL_NO_RWG_SE, tl, tl)
 DEF_HELPER_1(msgsnd, void, tl)
+DEF_HELPER_1(msgsndp, void, tl)
 DEF_HELPER_2(msgclr, void, env, tl)
 DEF_HELPER_1(book3s_msgsnd, void, tl)
 DEF_HELPER_2(book3s_msgclr, void, env, tl)
diff --git a/target/ppc/translate.c b/target/ppc/translate.c
index 4a5de28036..083731292b 100644
--- a/target/ppc/translate.c
+++ b/target/ppc/translate.c
@@ -6657,6 +6657,15 @@ static void gen_msgsnd(DisasContext *ctx)
 #endif /* defined(CONFIG_USER_ONLY) */
 }
 
+static void gen_msgsndp(DisasContext *ctx)
+{
+#if defined(CONFIG_USER_ONLY)
+    GEN_PRIV;
+#else
+    gen_helper_msgsndp(cpu_gpr[rB(ctx->opcode)]);
+#endif /* defined(CONFIG_USER_ONLY) */
+}
+
 static void gen_msgsync(DisasContext *ctx)
 {
 #if defined(CONFIG_USER_ONLY)
@@ -7176,6 +7185,8 @@ GEN_HANDLER2_E(tlbilx_booke206, "tlbilx", 0x1F, 0x12, 0x00, 0x03800001,
                PPC_NONE, PPC2_BOOKE206),
 GEN_HANDLER2_E(msgsnd, "msgsnd", 0x1F, 0x0E, 0x06, 0x03ff0001,
                PPC_NONE, PPC2_PRCNTL),
+GEN_HANDLER2_E(msgsndp, "msgsndp", 0x1F, 0x0E, 0x04, 0x03ff0001,
+               PPC_NONE, PPC_POWER),
 GEN_HANDLER2_E(msgclr, "msgclr", 0x1F, 0x0E, 0x07, 0x03ff0001,
                PPC_NONE, PPC2_PRCNTL),
 GEN_HANDLER2_E(msgsync, "msgsync", 0x1F, 0x16, 0x1B, 0x00000000,
-- 
2.21.0


Re: [Qemu-devel] [PATCH] ppc: Improve SMT experience with TCG accel
Posted by David Gibson 4 years, 8 months ago
On Tue, Jul 16, 2019 at 01:02:53AM -0300, Jose Ricardo Ziviani wrote:
> It's not possible to specify the number of threads of a guest when
> running QEMU/TCG. Today, users can have setups like:
> 
> ... -accel tcg,thread=multi -smp 8,threads=1,cores=8 ...
> or
> ... -accel tcg,thread=multi -smp 8,sockets=2,cores=4,threads=1 ...
> 
> However, the following is not possible:
> 
> ... -accel tcg,thread=multi -smp 16,threads=4,cores=2,sockets=2 ...
> qemu-system-ppc64: TCG cannot support more than 1 thread/core on a pseries machine
> 
> The reason is how SMT has been implemented since POWER8. This patch
> implements a very basic simulation of the msgsndp instruction, using the
> external interrupt instead of doorbells. The result is a better user
> experience, allowing users to play with SMT modes. However, it is not
> related to MTTCG threads in any way.

This really isn't enough.

POWER also has a number of SPRs which are per-core rather than
per-thread, but currently TCG treats everything as per-thread.  You'd
need to properly implement per-core registers before you can advertise
support for multiple threads in TCG.

> 
> Results:
> ... -accel tcg,thread=multi -smp 16,threads=4,cores=2,sockets=2 ...
> 
> root@ubuntu:~# ppc64_cpu --smt
> SMT=4
> root@ubuntu:~# ppc64_cpu --info
> Core   0:    0*    1*    2*    3*
> Core   1:    4*    5*    6*    7*
> Core   2:    8*    9*   10*   11*
> Core   3:   12*   13*   14*   15*
> root@ubuntu:~# ppc64_cpu --smt=2
> root@ubuntu:~# ppc64_cpu --info
> Core   0:    0*    1*    2     3
> Core   1:    4*    5*    6     7
> Core   2:    8*    9*   10    11
> Core   3:   12*   13*   14    15
> root@ubuntu:~# ppc64_cpu --smt=off
> root@ubuntu:~# ppc64_cpu --info
> Core   0:    0*    1     2     3
> Core   1:    4*    5     6     7
> Core   2:    8*    9    10    11
> Core   3:   12*   13    14    15
> 
> root@ubuntu:~# ppc64_cpu --smt
> SMT is off
> root@ubuntu:~# lscpu
> Architecture:         ppc64le
> Byte Order:           Little Endian
> CPU(s):               16
> On-line CPU(s) list:  0,4,8,12
> Off-line CPU(s) list: 1-3,5-7,9-11,13-15
> Thread(s) per core:   1
> Core(s) per socket:   2
> Socket(s):            2
> NUMA node(s):         1
> Model:                2.0 (pvr 004e 1200)
> Model name:           POWER9 (architected), altivec supported
> Hypervisor vendor:    KVM
> Virtualization type:  para
> L1d cache:            32K
> L1i cache:            32K
> NUMA node0 CPU(s):    0,4,8,12
> 
> root@ubuntu:~# ppc64_cpu --smt=4
> root@ubuntu:~# lscpu
> Architecture:        ppc64le
> Byte Order:          Little Endian
> CPU(s):              16
> On-line CPU(s) list: 0-15
> Thread(s) per core:  4
> Core(s) per socket:  2
> Socket(s):           2
> NUMA node(s):        1
> Model:               2.0 (pvr 004e 1200)
> Model name:          POWER9 (architected), altivec supported
> Hypervisor vendor:   KVM
> Virtualization type: para
> L1d cache:           32K
> L1i cache:           32K
> NUMA node0 CPU(s):   0-15
> 
> Note: it's also possible to simulate SMT in TCG single threaded mode.
> 
> Signed-off-by: Jose Ricardo Ziviani <joserz@linux.ibm.com>
> ---
>  hw/ppc/spapr.c           |  5 -----
>  target/ppc/excp_helper.c | 24 ++++++++++++++++++++++++
>  target/ppc/helper.h      |  1 +
>  target/ppc/translate.c   | 11 +++++++++++
>  4 files changed, 36 insertions(+), 5 deletions(-)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 8783b43396..3a864dfc7d 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -2572,11 +2572,6 @@ static void spapr_set_vsmt_mode(SpaprMachineState *spapr, Error **errp)
>      int ret;
>      unsigned int smp_threads = ms->smp.threads;
>  
> -    if (!kvm_enabled() && (smp_threads > 1)) {
> -        error_setg(&local_err, "TCG cannot support more than 1 thread/core "
> -                     "on a pseries machine");
> -        goto out;
> -    }
>      if (!is_power_of_2(smp_threads)) {
>          error_setg(&local_err, "Cannot support %d threads/core on a pseries "
>                       "machine because it must be a power of 2", smp_threads);
> diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
> index 50b004d00d..ac5d196641 100644
> --- a/target/ppc/excp_helper.c
> +++ b/target/ppc/excp_helper.c
> @@ -1231,6 +1231,30 @@ static int book3s_dbell2irq(target_ulong rb)
>      return msg == DBELL_TYPE_DBELL_SERVER ? PPC_INTERRUPT_HDOORBELL : -1;
>  }
>  
> +void helper_msgsndp(target_ulong rb)
> +{
> +    CPUState *cs;
> +    int irq = rb & DBELL_TYPE_MASK;
> +    int thread_id = rb & 0x3f;
> +
> +    if (irq != DBELL_TYPE_DBELL_SERVER) {
> +        return;
> +    }
> +
> +    qemu_mutex_lock_iothread();
> +    CPU_FOREACH(cs) {
> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
> +
> +        if (cpu->vcpu_id == thread_id) {
> +            continue;
> +        }
> +
> +        cpu->env.pending_interrupts |= 1 << PPC_INTERRUPT_EXT;
> +        cpu_interrupt(cs, CPU_INTERRUPT_HARD);
> +    }
> +    qemu_mutex_unlock_iothread();
> +}
> +
>  void helper_book3s_msgclr(CPUPPCState *env, target_ulong rb)
>  {
>      int irq = book3s_dbell2irq(rb);
> diff --git a/target/ppc/helper.h b/target/ppc/helper.h
> index 380c9b1e2a..eadd08324b 100644
> --- a/target/ppc/helper.h
> +++ b/target/ppc/helper.h
> @@ -630,6 +630,7 @@ DEF_HELPER_FLAGS_3(store_sr, TCG_CALL_NO_RWG, void, env, tl, tl)
>  
>  DEF_HELPER_FLAGS_1(602_mfrom, TCG_CALL_NO_RWG_SE, tl, tl)
>  DEF_HELPER_1(msgsnd, void, tl)
> +DEF_HELPER_1(msgsndp, void, tl)
>  DEF_HELPER_2(msgclr, void, env, tl)
>  DEF_HELPER_1(book3s_msgsnd, void, tl)
>  DEF_HELPER_2(book3s_msgclr, void, env, tl)
> diff --git a/target/ppc/translate.c b/target/ppc/translate.c
> index 4a5de28036..083731292b 100644
> --- a/target/ppc/translate.c
> +++ b/target/ppc/translate.c
> @@ -6657,6 +6657,15 @@ static void gen_msgsnd(DisasContext *ctx)
>  #endif /* defined(CONFIG_USER_ONLY) */
>  }
>  
> +static void gen_msgsndp(DisasContext *ctx)
> +{
> +#if defined(CONFIG_USER_ONLY)
> +    GEN_PRIV;
> +#else
> +    gen_helper_msgsndp(cpu_gpr[rB(ctx->opcode)]);
> +#endif /* defined(CONFIG_USER_ONLY) */
> +}
> +
>  static void gen_msgsync(DisasContext *ctx)
>  {
>  #if defined(CONFIG_USER_ONLY)
> @@ -7176,6 +7185,8 @@ GEN_HANDLER2_E(tlbilx_booke206, "tlbilx", 0x1F, 0x12, 0x00, 0x03800001,
>                 PPC_NONE, PPC2_BOOKE206),
>  GEN_HANDLER2_E(msgsnd, "msgsnd", 0x1F, 0x0E, 0x06, 0x03ff0001,
>                 PPC_NONE, PPC2_PRCNTL),
> +GEN_HANDLER2_E(msgsndp, "msgsndp", 0x1F, 0x0E, 0x04, 0x03ff0001,
> +               PPC_NONE, PPC_POWER),
>  GEN_HANDLER2_E(msgclr, "msgclr", 0x1F, 0x0E, 0x07, 0x03ff0001,
>                 PPC_NONE, PPC2_PRCNTL),
>  GEN_HANDLER2_E(msgsync, "msgsync", 0x1F, 0x16, 0x1B, 0x00000000,

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson