1 | The following changes since commit 2ecfc0657afa5d29a373271b342f704a1a3c6737: | 1 | Pulling together some cleanups, fixes, and preparatory tci stuff. |
---|---|---|---|
2 | Most of this has been reviewed, but not all. | ||
2 | 3 | ||
3 | Merge remote-tracking branch 'remotes/armbru/tags/pull-misc-2020-12-10' into staging (2020-12-10 17:01:05 +0000) | 4 | Those lacking review: |
4 | 5 | ||
5 | are available in the Git repository at: | 6 | 01-tcg-aarch64-Fix-constant-subtraction-in-tcg_out_adds.patch |
7 | 02-tcg-aarch64-Fix-I3617_CMLE0.patch | ||
8 | 03-tcg-aarch64-Fix-generation-of-scalar-vector-operatio.patch | ||
9 | 04-tcg-tci-Use-exec-cpu_ldst.h-interfaces.patch | ||
10 | 06-tcg-Manage-splitwx-in-tc_ptr_to_region_tree-by-hand.patch | ||
11 | 23-accel-tcg-rename-tb_lookup__cpu_state-and-hoist-stat.patch | ||
12 | 24-accel-tcg-move-CF_CLUSTER-calculation-to-curr_cflags.patch | ||
13 | 25-accel-tcg-drop-the-use-of-CF_HASH_MASK-and-rename-pa.patch | ||
14 | 26-include-exec-lightly-re-arrange-TranslationBlock.patch | ||
15 | 27-accel-tcg-Precompute-curr_cflags-into-cpu-tcg_cflags.patch | ||
6 | 16 | ||
7 | https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20201210 | 17 | Alex, the last patch is a re-write and extension of one that |
18 | you did review. | ||
8 | 19 | ||
9 | for you to fetch changes up to 9e2658d62ebc23efe7df43fc0e306f129510d874: | ||
10 | 20 | ||
11 | accel/tcg: rename tcg-cpus functions to match module name (2020-12-10 17:44:10 -0600) | 21 | r~ |
12 | 22 | ||
13 | ---------------------------------------------------------------- | ||
14 | Split CpusAccel for tcg variants | ||
15 | 23 | ||
16 | ---------------------------------------------------------------- | 24 | Alex Bennée (4): |
17 | Claudio Fontana (3): | 25 | accel/tcg: rename tb_lookup__cpu_state and hoist state extraction |
18 | accel/tcg: split CpusAccel into three TCG variants | 26 | accel/tcg: move CF_CLUSTER calculation to curr_cflags |
19 | accel/tcg: split tcg_start_vcpu_thread | 27 | accel/tcg: drop the use of CF_HASH_MASK and rename params |
20 | accel/tcg: rename tcg-cpus functions to match module name | 28 | include/exec: lightly re-arrange TranslationBlock |
21 | 29 | ||
22 | accel/tcg/tcg-cpus-icount.h | 17 ++ | 30 | Richard Henderson (23): |
23 | accel/tcg/tcg-cpus-rr.h | 21 ++ | 31 | tcg/aarch64: Fix constant subtraction in tcg_out_addsub2 |
24 | accel/tcg/tcg-cpus.h | 12 +- | 32 | tcg/aarch64: Fix I3617_CMLE0 |
25 | accel/tcg/tcg-all.c | 13 +- | 33 | tcg/aarch64: Fix generation of "scalar" vector operations |
26 | accel/tcg/tcg-cpus-icount.c | 147 +++++++++++++ | 34 | tcg/tci: Use exec/cpu_ldst.h interfaces |
27 | accel/tcg/tcg-cpus-mttcg.c | 140 ++++++++++++ | 35 | tcg: Split out tcg_raise_tb_overflow |
28 | accel/tcg/tcg-cpus-rr.c | 305 ++++++++++++++++++++++++++ | 36 | tcg: Manage splitwx in tc_ptr_to_region_tree by hand |
29 | accel/tcg/tcg-cpus.c | 506 +------------------------------------------- | 37 | tcg/tci: Merge identical cases in generation (arithmetic opcodes) |
30 | softmmu/icount.c | 2 +- | 38 | tcg/tci: Merge identical cases in generation (exchange opcodes) |
31 | accel/tcg/meson.build | 9 +- | 39 | tcg/tci: Merge identical cases in generation (deposit opcode) |
32 | 10 files changed, 670 insertions(+), 502 deletions(-) | 40 | tcg/tci: Merge identical cases in generation (conditional opcodes) |
33 | create mode 100644 accel/tcg/tcg-cpus-icount.h | 41 | tcg/tci: Merge identical cases in generation (load/store opcodes) |
34 | create mode 100644 accel/tcg/tcg-cpus-rr.h | 42 | tcg/tci: Remove tci_read_r8 |
35 | create mode 100644 accel/tcg/tcg-cpus-icount.c | 43 | tcg/tci: Remove tci_read_r8s |
36 | create mode 100644 accel/tcg/tcg-cpus-mttcg.c | 44 | tcg/tci: Remove tci_read_r16 |
37 | create mode 100644 accel/tcg/tcg-cpus-rr.c | 45 | tcg/tci: Remove tci_read_r16s |
46 | tcg/tci: Remove tci_read_r32 | ||
47 | tcg/tci: Remove tci_read_r32s | ||
48 | tcg/tci: Reduce use of tci_read_r64 | ||
49 | tcg/tci: Merge basic arithmetic operations | ||
50 | tcg/tci: Merge extension operations | ||
51 | tcg/tci: Merge bswap operations | ||
52 | tcg/tci: Merge mov, not and neg operations | ||
53 | accel/tcg: Precompute curr_cflags into cpu->tcg_cflags | ||
38 | 54 | ||
55 | accel/tcg/tcg-accel-ops.h | 1 + | ||
56 | include/exec/exec-all.h | 19 +- | ||
57 | include/exec/tb-lookup.h | 26 +- | ||
58 | include/hw/core/cpu.h | 2 + | ||
59 | accel/tcg/cpu-exec.c | 34 ++- | ||
60 | accel/tcg/tcg-accel-ops-mttcg.c | 3 +- | ||
61 | accel/tcg/tcg-accel-ops-rr.c | 2 +- | ||
62 | accel/tcg/tcg-accel-ops.c | 8 + | ||
63 | accel/tcg/tcg-runtime.c | 6 +- | ||
64 | accel/tcg/translate-all.c | 18 +- | ||
65 | linux-user/main.c | 1 + | ||
66 | linux-user/sh4/signal.c | 8 +- | ||
67 | linux-user/syscall.c | 18 +- | ||
68 | softmmu/physmem.c | 2 +- | ||
69 | tcg/tcg.c | 29 +- | ||
70 | tcg/tci.c | 526 ++++++++++---------------------- | ||
71 | tcg/aarch64/tcg-target.c.inc | 229 +++++++++++--- | ||
72 | tcg/tci/tcg-target.c.inc | 204 +++++-------- | ||
73 | 18 files changed, 526 insertions(+), 610 deletions(-) | ||
74 | |||
75 | -- | ||
76 | 2.25.1 | ||
77 | |||
New patch | |||
---|---|---|---|
1 | An hppa guest executing | ||
1 | 2 | ||
3 | 0x000000000000e05c: ldil L%10000,r4 | ||
4 | 0x000000000000e060: ldo 0(r4),r4 | ||
5 | 0x000000000000e064: sub r3,r4,sp | ||
6 | |||
7 | produces | ||
8 | |||
9 | ---- 000000000000e064 000000000000e068 | ||
10 | sub2_i32 tmp0,tmp4,r3,$0x1,$0x10000,$0x0 | ||
11 | |||
12 | after folding and constant propagation. Then we hit | ||
13 | |||
14 | tcg-target.c.inc:640: tcg_out_insn_3401: Assertion `aimm <= 0xfff' failed. | ||
15 | |||
16 | because aimm is in fact -16, but unsigned. | ||
17 | |||
18 | The ((bl < 0) ^ sub) condition which negates bl is incorrect and will | ||
19 | always lead to this abort. If the constant is positive, sub will make | ||
20 | it negative; if the constant is negative, sub will keep it negative. | ||
21 | |||
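A stand-alone sketch of the corrected selection (not part of the patch): ADDSI/SUBSI below are made-up stand-ins for the real I3401_* opcodes, so the sign handling can be checked in isolation. The immediate handed to the encoder is always made non-negative, and the sign of bl is folded into the choice of add vs subtract.

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Made-up stand-ins for the AArch64 immediate add/sub opcodes. */
    typedef enum { ADDSI, SUBSI } Insn;

    /* Selection logic as in the patched tcg_out_addsub2: never hand the
     * encoder a negative immediate; flip the opcode instead. */
    static Insn pick_insn(int64_t *bl, bool sub)
    {
        if (*bl < 0) {
            *bl = -*bl;
            return sub ? ADDSI : SUBSI;   /* subtracting a negative adds |bl| */
        }
        return sub ? SUBSI : ADDSI;
    }

    int main(void)
    {
        static const struct { int64_t bl; bool sub; } cases[] = {
            { 0x10000, true }, { -16, true }, { -16, false }, { 42, false },
        };

        for (unsigned i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
            int64_t bl = cases[i].bl;
            Insn insn = pick_insn(&bl, cases[i].sub);

            /* The old ((bl < 0) ^ sub) logic left bl negative for every
             * subtraction, which is what tripped the encoder's range check. */
            assert(bl >= 0);
            printf("bl=%lld sub=%d -> %s, imm=%lld\n",
                   (long long)cases[i].bl, cases[i].sub,
                   insn == SUBSI ? "SUBSI" : "ADDSI", (long long)bl);
        }
        return 0;
    }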
22 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
23 | --- | ||
24 | tcg/aarch64/tcg-target.c.inc | 16 +++++++++------- | ||
25 | 1 file changed, 9 insertions(+), 7 deletions(-) | ||
26 | |||
27 | diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc | ||
28 | index XXXXXXX..XXXXXXX 100644 | ||
29 | --- a/tcg/aarch64/tcg-target.c.inc | ||
30 | +++ b/tcg/aarch64/tcg-target.c.inc | ||
31 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_addsubi(TCGContext *s, int ext, TCGReg rd, | ||
32 | } | ||
33 | } | ||
34 | |||
35 | -static inline void tcg_out_addsub2(TCGContext *s, TCGType ext, TCGReg rl, | ||
36 | - TCGReg rh, TCGReg al, TCGReg ah, | ||
37 | - tcg_target_long bl, tcg_target_long bh, | ||
38 | - bool const_bl, bool const_bh, bool sub) | ||
39 | +static void tcg_out_addsub2(TCGContext *s, TCGType ext, TCGReg rl, | ||
40 | + TCGReg rh, TCGReg al, TCGReg ah, | ||
41 | + tcg_target_long bl, tcg_target_long bh, | ||
42 | + bool const_bl, bool const_bh, bool sub) | ||
43 | { | ||
44 | TCGReg orig_rl = rl; | ||
45 | AArch64Insn insn; | ||
46 | @@ -XXX,XX +XXX,XX @@ static inline void tcg_out_addsub2(TCGContext *s, TCGType ext, TCGReg rl, | ||
47 | } | ||
48 | |||
49 | if (const_bl) { | ||
50 | - insn = I3401_ADDSI; | ||
51 | - if ((bl < 0) ^ sub) { | ||
52 | - insn = I3401_SUBSI; | ||
53 | + if (bl < 0) { | ||
54 | bl = -bl; | ||
55 | + insn = sub ? I3401_ADDSI : I3401_SUBSI; | ||
56 | + } else { | ||
57 | + insn = sub ? I3401_SUBSI : I3401_ADDSI; | ||
58 | } | ||
59 | + | ||
60 | if (unlikely(al == TCG_REG_XZR)) { | ||
61 | /* ??? We want to allow al to be zero for the benefit of | ||
62 | negation via subtraction. However, that leaves open the | ||
63 | -- | ||
64 | 2.25.1 | ||
65 | |||
New patch | |||
---|---|---|---|
1 | Fix a typo in the encoding of the cmle (zero) instruction. |
1 | 2 | ||
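A quick stand-alone check of the typo, using only the constants visible in the hunk below: bit 29 is the bit that pairs these compare-against-zero encodings, so CMLE0 should be the bit-29 variant of CMEQ0, whereas the old value paired bit 29 with CMLT0's base bits.

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Encoding constants as they appear in tcg/aarch64/tcg-target.c.inc. */
    enum {
        I3617_CMEQ0     = 0x0e209800,
        I3617_CMLT0     = 0x0e20a800,
        I3617_CMLE0_BAD = 0x2e20a800,   /* value before this patch */
        I3617_CMLE0_FIX = 0x2e209800,   /* value after this patch */
    };

    int main(void)
    {
        const uint32_t BIT29 = 1u << 29;

        /* CMLE (zero) is the bit-29 variant of CMEQ (zero)... */
        assert(I3617_CMLE0_FIX == (I3617_CMEQ0 | BIT29));
        /* ...whereas the old value combined bit 29 with CMLT0's bits,
         * which is not a CMLE encoding. */
        assert(I3617_CMLE0_BAD == (I3617_CMLT0 | BIT29));

        printf("CMLE0: bad=%#010x fix=%#010x\n",
               (unsigned)I3617_CMLE0_BAD, (unsigned)I3617_CMLE0_FIX);
        return 0;
    }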
3 | Fixes: 14e4c1e2355 ("tcg/aarch64: Add vector operations") | ||
4 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
5 | --- | ||
6 | tcg/aarch64/tcg-target.c.inc | 2 +- | ||
7 | 1 file changed, 1 insertion(+), 1 deletion(-) | ||
8 | |||
9 | diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc | ||
10 | index XXXXXXX..XXXXXXX 100644 | ||
11 | --- a/tcg/aarch64/tcg-target.c.inc | ||
12 | +++ b/tcg/aarch64/tcg-target.c.inc | ||
13 | @@ -XXX,XX +XXX,XX @@ typedef enum { | ||
14 | I3617_CMEQ0 = 0x0e209800, | ||
15 | I3617_CMLT0 = 0x0e20a800, | ||
16 | I3617_CMGE0 = 0x2e208800, | ||
17 | - I3617_CMLE0 = 0x2e20a800, | ||
18 | + I3617_CMLE0 = 0x2e209800, | ||
19 | I3617_NOT = 0x2e205800, | ||
20 | I3617_ABS = 0x0e20b800, | ||
21 | I3617_NEG = 0x2e20b800, | ||
22 | -- | ||
23 | 2.25.1 | ||
24 | |||
New patch | |||
---|---|---|---|
1 | For some vector operations, "1D" is not a valid type, and there | ||
2 | are separate instructions for the 64-bit scalar operation. | ||
1 | 3 | ||
4 | Tested-by: Stefan Weil <sw@weilnetz.de> | ||
5 | Buglink: https://bugs.launchpad.net/qemu/+bug/1916112 | ||
6 | Fixes: 14e4c1e2355 ("tcg/aarch64: Add vector operations") | ||
7 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
8 | --- | ||
9 | tcg/aarch64/tcg-target.c.inc | 211 ++++++++++++++++++++++++++++++----- | ||
10 | 1 file changed, 181 insertions(+), 30 deletions(-) | ||
11 | |||
12 | diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc | ||
13 | index XXXXXXX..XXXXXXX 100644 | ||
14 | --- a/tcg/aarch64/tcg-target.c.inc | ||
15 | +++ b/tcg/aarch64/tcg-target.c.inc | ||
16 | @@ -XXX,XX +XXX,XX @@ typedef enum { | ||
17 | I3606_BIC = 0x2f001400, | ||
18 | I3606_ORR = 0x0f001400, | ||
19 | |||
20 | + /* AdvSIMD scalar shift by immediate */ | ||
21 | + I3609_SSHR = 0x5f000400, | ||
22 | + I3609_SSRA = 0x5f001400, | ||
23 | + I3609_SHL = 0x5f005400, | ||
24 | + I3609_USHR = 0x7f000400, | ||
25 | + I3609_USRA = 0x7f001400, | ||
26 | + I3609_SLI = 0x7f005400, | ||
27 | + | ||
28 | + /* AdvSIMD scalar three same */ | ||
29 | + I3611_SQADD = 0x5e200c00, | ||
30 | + I3611_SQSUB = 0x5e202c00, | ||
31 | + I3611_CMGT = 0x5e203400, | ||
32 | + I3611_CMGE = 0x5e203c00, | ||
33 | + I3611_SSHL = 0x5e204400, | ||
34 | + I3611_ADD = 0x5e208400, | ||
35 | + I3611_CMTST = 0x5e208c00, | ||
36 | + I3611_UQADD = 0x7e200c00, | ||
37 | + I3611_UQSUB = 0x7e202c00, | ||
38 | + I3611_CMHI = 0x7e203400, | ||
39 | + I3611_CMHS = 0x7e203c00, | ||
40 | + I3611_USHL = 0x7e204400, | ||
41 | + I3611_SUB = 0x7e208400, | ||
42 | + I3611_CMEQ = 0x7e208c00, | ||
43 | + | ||
44 | + /* AdvSIMD scalar two-reg misc */ | ||
45 | + I3612_CMGT0 = 0x5e208800, | ||
46 | + I3612_CMEQ0 = 0x5e209800, | ||
47 | + I3612_CMLT0 = 0x5e20a800, | ||
48 | + I3612_ABS = 0x5e20b800, | ||
49 | + I3612_CMGE0 = 0x7e208800, | ||
50 | + I3612_CMLE0 = 0x7e209800, | ||
51 | + I3612_NEG = 0x7e20b800, | ||
52 | + | ||
53 | /* AdvSIMD shift by immediate */ | ||
54 | I3614_SSHR = 0x0f000400, | ||
55 | I3614_SSRA = 0x0f001400, | ||
56 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_insn_3606(TCGContext *s, AArch64Insn insn, bool q, | ||
57 | | (imm8 & 0xe0) << (16 - 5) | (imm8 & 0x1f) << 5); | ||
58 | } | ||
59 | |||
60 | +static void tcg_out_insn_3609(TCGContext *s, AArch64Insn insn, | ||
61 | + TCGReg rd, TCGReg rn, unsigned immhb) | ||
62 | +{ | ||
63 | + tcg_out32(s, insn | immhb << 16 | (rn & 0x1f) << 5 | (rd & 0x1f)); | ||
64 | +} | ||
65 | + | ||
66 | +static void tcg_out_insn_3611(TCGContext *s, AArch64Insn insn, | ||
67 | + unsigned size, TCGReg rd, TCGReg rn, TCGReg rm) | ||
68 | +{ | ||
69 | + tcg_out32(s, insn | (size << 22) | (rm & 0x1f) << 16 | ||
70 | + | (rn & 0x1f) << 5 | (rd & 0x1f)); | ||
71 | +} | ||
72 | + | ||
73 | +static void tcg_out_insn_3612(TCGContext *s, AArch64Insn insn, | ||
74 | + unsigned size, TCGReg rd, TCGReg rn) | ||
75 | +{ | ||
76 | + tcg_out32(s, insn | (size << 22) | (rn & 0x1f) << 5 | (rd & 0x1f)); | ||
77 | +} | ||
78 | + | ||
79 | static void tcg_out_insn_3614(TCGContext *s, AArch64Insn insn, bool q, | ||
80 | TCGReg rd, TCGReg rn, unsigned immhb) | ||
81 | { | ||
82 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, | ||
83 | unsigned vecl, unsigned vece, | ||
84 | const TCGArg *args, const int *const_args) | ||
85 | { | ||
86 | - static const AArch64Insn cmp_insn[16] = { | ||
87 | + static const AArch64Insn cmp_vec_insn[16] = { | ||
88 | [TCG_COND_EQ] = I3616_CMEQ, | ||
89 | [TCG_COND_GT] = I3616_CMGT, | ||
90 | [TCG_COND_GE] = I3616_CMGE, | ||
91 | [TCG_COND_GTU] = I3616_CMHI, | ||
92 | [TCG_COND_GEU] = I3616_CMHS, | ||
93 | }; | ||
94 | - static const AArch64Insn cmp0_insn[16] = { | ||
95 | + static const AArch64Insn cmp_scalar_insn[16] = { | ||
96 | + [TCG_COND_EQ] = I3611_CMEQ, | ||
97 | + [TCG_COND_GT] = I3611_CMGT, | ||
98 | + [TCG_COND_GE] = I3611_CMGE, | ||
99 | + [TCG_COND_GTU] = I3611_CMHI, | ||
100 | + [TCG_COND_GEU] = I3611_CMHS, | ||
101 | + }; | ||
102 | + static const AArch64Insn cmp0_vec_insn[16] = { | ||
103 | [TCG_COND_EQ] = I3617_CMEQ0, | ||
104 | [TCG_COND_GT] = I3617_CMGT0, | ||
105 | [TCG_COND_GE] = I3617_CMGE0, | ||
106 | [TCG_COND_LT] = I3617_CMLT0, | ||
107 | [TCG_COND_LE] = I3617_CMLE0, | ||
108 | }; | ||
109 | + static const AArch64Insn cmp0_scalar_insn[16] = { | ||
110 | + [TCG_COND_EQ] = I3612_CMEQ0, | ||
111 | + [TCG_COND_GT] = I3612_CMGT0, | ||
112 | + [TCG_COND_GE] = I3612_CMGE0, | ||
113 | + [TCG_COND_LT] = I3612_CMLT0, | ||
114 | + [TCG_COND_LE] = I3612_CMLE0, | ||
115 | + }; | ||
116 | |||
117 | TCGType type = vecl + TCG_TYPE_V64; | ||
118 | unsigned is_q = vecl; | ||
119 | + bool is_scalar = !is_q && vece == MO_64; | ||
120 | TCGArg a0, a1, a2, a3; | ||
121 | int cmode, imm8; | ||
122 | |||
123 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, | ||
124 | tcg_out_dupm_vec(s, type, vece, a0, a1, a2); | ||
125 | break; | ||
126 | case INDEX_op_add_vec: | ||
127 | - tcg_out_insn(s, 3616, ADD, is_q, vece, a0, a1, a2); | ||
128 | + if (is_scalar) { | ||
129 | + tcg_out_insn(s, 3611, ADD, vece, a0, a1, a2); | ||
130 | + } else { | ||
131 | + tcg_out_insn(s, 3616, ADD, is_q, vece, a0, a1, a2); | ||
132 | + } | ||
133 | break; | ||
134 | case INDEX_op_sub_vec: | ||
135 | - tcg_out_insn(s, 3616, SUB, is_q, vece, a0, a1, a2); | ||
136 | + if (is_scalar) { | ||
137 | + tcg_out_insn(s, 3611, SUB, vece, a0, a1, a2); | ||
138 | + } else { | ||
139 | + tcg_out_insn(s, 3616, SUB, is_q, vece, a0, a1, a2); | ||
140 | + } | ||
141 | break; | ||
142 | case INDEX_op_mul_vec: | ||
143 | tcg_out_insn(s, 3616, MUL, is_q, vece, a0, a1, a2); | ||
144 | break; | ||
145 | case INDEX_op_neg_vec: | ||
146 | - tcg_out_insn(s, 3617, NEG, is_q, vece, a0, a1); | ||
147 | + if (is_scalar) { | ||
148 | + tcg_out_insn(s, 3612, NEG, vece, a0, a1); | ||
149 | + } else { | ||
150 | + tcg_out_insn(s, 3617, NEG, is_q, vece, a0, a1); | ||
151 | + } | ||
152 | break; | ||
153 | case INDEX_op_abs_vec: | ||
154 | - tcg_out_insn(s, 3617, ABS, is_q, vece, a0, a1); | ||
155 | + if (is_scalar) { | ||
156 | + tcg_out_insn(s, 3612, ABS, vece, a0, a1); | ||
157 | + } else { | ||
158 | + tcg_out_insn(s, 3617, ABS, is_q, vece, a0, a1); | ||
159 | + } | ||
160 | break; | ||
161 | case INDEX_op_and_vec: | ||
162 | if (const_args[2]) { | ||
163 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, | ||
164 | tcg_out_insn(s, 3616, EOR, is_q, 0, a0, a1, a2); | ||
165 | break; | ||
166 | case INDEX_op_ssadd_vec: | ||
167 | - tcg_out_insn(s, 3616, SQADD, is_q, vece, a0, a1, a2); | ||
168 | + if (is_scalar) { | ||
169 | + tcg_out_insn(s, 3611, SQADD, vece, a0, a1, a2); | ||
170 | + } else { | ||
171 | + tcg_out_insn(s, 3616, SQADD, is_q, vece, a0, a1, a2); | ||
172 | + } | ||
173 | break; | ||
174 | case INDEX_op_sssub_vec: | ||
175 | - tcg_out_insn(s, 3616, SQSUB, is_q, vece, a0, a1, a2); | ||
176 | + if (is_scalar) { | ||
177 | + tcg_out_insn(s, 3611, SQSUB, vece, a0, a1, a2); | ||
178 | + } else { | ||
179 | + tcg_out_insn(s, 3616, SQSUB, is_q, vece, a0, a1, a2); | ||
180 | + } | ||
181 | break; | ||
182 | case INDEX_op_usadd_vec: | ||
183 | - tcg_out_insn(s, 3616, UQADD, is_q, vece, a0, a1, a2); | ||
184 | + if (is_scalar) { | ||
185 | + tcg_out_insn(s, 3611, UQADD, vece, a0, a1, a2); | ||
186 | + } else { | ||
187 | + tcg_out_insn(s, 3616, UQADD, is_q, vece, a0, a1, a2); | ||
188 | + } | ||
189 | break; | ||
190 | case INDEX_op_ussub_vec: | ||
191 | - tcg_out_insn(s, 3616, UQSUB, is_q, vece, a0, a1, a2); | ||
192 | + if (is_scalar) { | ||
193 | + tcg_out_insn(s, 3611, UQSUB, vece, a0, a1, a2); | ||
194 | + } else { | ||
195 | + tcg_out_insn(s, 3616, UQSUB, is_q, vece, a0, a1, a2); | ||
196 | + } | ||
197 | break; | ||
198 | case INDEX_op_smax_vec: | ||
199 | tcg_out_insn(s, 3616, SMAX, is_q, vece, a0, a1, a2); | ||
200 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, | ||
201 | tcg_out_insn(s, 3617, NOT, is_q, 0, a0, a1); | ||
202 | break; | ||
203 | case INDEX_op_shli_vec: | ||
204 | - tcg_out_insn(s, 3614, SHL, is_q, a0, a1, a2 + (8 << vece)); | ||
205 | + if (is_scalar) { | ||
206 | + tcg_out_insn(s, 3609, SHL, a0, a1, a2 + (8 << vece)); | ||
207 | + } else { | ||
208 | + tcg_out_insn(s, 3614, SHL, is_q, a0, a1, a2 + (8 << vece)); | ||
209 | + } | ||
210 | break; | ||
211 | case INDEX_op_shri_vec: | ||
212 | - tcg_out_insn(s, 3614, USHR, is_q, a0, a1, (16 << vece) - a2); | ||
213 | + if (is_scalar) { | ||
214 | + tcg_out_insn(s, 3609, USHR, a0, a1, (16 << vece) - a2); | ||
215 | + } else { | ||
216 | + tcg_out_insn(s, 3614, USHR, is_q, a0, a1, (16 << vece) - a2); | ||
217 | + } | ||
218 | break; | ||
219 | case INDEX_op_sari_vec: | ||
220 | - tcg_out_insn(s, 3614, SSHR, is_q, a0, a1, (16 << vece) - a2); | ||
221 | + if (is_scalar) { | ||
222 | + tcg_out_insn(s, 3609, SSHR, a0, a1, (16 << vece) - a2); | ||
223 | + } else { | ||
224 | + tcg_out_insn(s, 3614, SSHR, is_q, a0, a1, (16 << vece) - a2); | ||
225 | + } | ||
226 | break; | ||
227 | case INDEX_op_aa64_sli_vec: | ||
228 | - tcg_out_insn(s, 3614, SLI, is_q, a0, a2, args[3] + (8 << vece)); | ||
229 | + if (is_scalar) { | ||
230 | + tcg_out_insn(s, 3609, SLI, a0, a2, args[3] + (8 << vece)); | ||
231 | + } else { | ||
232 | + tcg_out_insn(s, 3614, SLI, is_q, a0, a2, args[3] + (8 << vece)); | ||
233 | + } | ||
234 | break; | ||
235 | case INDEX_op_shlv_vec: | ||
236 | - tcg_out_insn(s, 3616, USHL, is_q, vece, a0, a1, a2); | ||
237 | + if (is_scalar) { | ||
238 | + tcg_out_insn(s, 3611, USHL, vece, a0, a1, a2); | ||
239 | + } else { | ||
240 | + tcg_out_insn(s, 3616, USHL, is_q, vece, a0, a1, a2); | ||
241 | + } | ||
242 | break; | ||
243 | case INDEX_op_aa64_sshl_vec: | ||
244 | - tcg_out_insn(s, 3616, SSHL, is_q, vece, a0, a1, a2); | ||
245 | + if (is_scalar) { | ||
246 | + tcg_out_insn(s, 3611, SSHL, vece, a0, a1, a2); | ||
247 | + } else { | ||
248 | + tcg_out_insn(s, 3616, SSHL, is_q, vece, a0, a1, a2); | ||
249 | + } | ||
250 | break; | ||
251 | case INDEX_op_cmp_vec: | ||
252 | { | ||
253 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, | ||
254 | |||
255 | if (cond == TCG_COND_NE) { | ||
256 | if (const_args[2]) { | ||
257 | - tcg_out_insn(s, 3616, CMTST, is_q, vece, a0, a1, a1); | ||
258 | + if (is_scalar) { | ||
259 | + tcg_out_insn(s, 3611, CMTST, vece, a0, a1, a1); | ||
260 | + } else { | ||
261 | + tcg_out_insn(s, 3616, CMTST, is_q, vece, a0, a1, a1); | ||
262 | + } | ||
263 | } else { | ||
264 | - tcg_out_insn(s, 3616, CMEQ, is_q, vece, a0, a1, a2); | ||
265 | + if (is_scalar) { | ||
266 | + tcg_out_insn(s, 3611, CMEQ, vece, a0, a1, a2); | ||
267 | + } else { | ||
268 | + tcg_out_insn(s, 3616, CMEQ, is_q, vece, a0, a1, a2); | ||
269 | + } | ||
270 | tcg_out_insn(s, 3617, NOT, is_q, 0, a0, a0); | ||
271 | } | ||
272 | } else { | ||
273 | if (const_args[2]) { | ||
274 | - insn = cmp0_insn[cond]; | ||
275 | - if (insn) { | ||
276 | - tcg_out_insn_3617(s, insn, is_q, vece, a0, a1); | ||
277 | - break; | ||
278 | + if (is_scalar) { | ||
279 | + insn = cmp0_scalar_insn[cond]; | ||
280 | + if (insn) { | ||
281 | + tcg_out_insn_3612(s, insn, vece, a0, a1); | ||
282 | + break; | ||
283 | + } | ||
284 | + } else { | ||
285 | + insn = cmp0_vec_insn[cond]; | ||
286 | + if (insn) { | ||
287 | + tcg_out_insn_3617(s, insn, is_q, vece, a0, a1); | ||
288 | + break; | ||
289 | + } | ||
290 | } | ||
291 | tcg_out_dupi_vec(s, type, MO_8, TCG_VEC_TMP, 0); | ||
292 | a2 = TCG_VEC_TMP; | ||
293 | } | ||
294 | - insn = cmp_insn[cond]; | ||
295 | - if (insn == 0) { | ||
296 | - TCGArg t; | ||
297 | - t = a1, a1 = a2, a2 = t; | ||
298 | - cond = tcg_swap_cond(cond); | ||
299 | - insn = cmp_insn[cond]; | ||
300 | - tcg_debug_assert(insn != 0); | ||
301 | + if (is_scalar) { | ||
302 | + insn = cmp_scalar_insn[cond]; | ||
303 | + if (insn == 0) { | ||
304 | + TCGArg t; | ||
305 | + t = a1, a1 = a2, a2 = t; | ||
306 | + cond = tcg_swap_cond(cond); | ||
307 | + insn = cmp_scalar_insn[cond]; | ||
308 | + tcg_debug_assert(insn != 0); | ||
309 | + } | ||
310 | + tcg_out_insn_3611(s, insn, vece, a0, a1, a2); | ||
311 | + } else { | ||
312 | + insn = cmp_vec_insn[cond]; | ||
313 | + if (insn == 0) { | ||
314 | + TCGArg t; | ||
315 | + t = a1, a1 = a2, a2 = t; | ||
316 | + cond = tcg_swap_cond(cond); | ||
317 | + insn = cmp_vec_insn[cond]; | ||
318 | + tcg_debug_assert(insn != 0); | ||
319 | + } | ||
320 | + tcg_out_insn_3616(s, insn, is_q, vece, a0, a1, a2); | ||
321 | } | ||
322 | - tcg_out_insn_3616(s, insn, is_q, vece, a0, a1, a2); | ||
323 | } | ||
324 | } | ||
325 | break; | ||
326 | -- | ||
327 | 2.25.1 | ||
328 | |||
New patch | |||
---|---|---|---|
1 | Use the provided cpu_ldst.h interfaces. This fixes the build vs | ||
2 | the unconverted uses of g2h(), adds missed memory trace events, | ||
3 | and correctly recognizes when a SIGSEGV belongs to the guest via | ||
4 | set_helper_retaddr(). | ||
1 | 5 | ||
6 | Fixes: 3e8f1628e864 | ||
7 | Tested-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
8 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
9 | --- | ||
10 | tcg/tci.c | 73 +++++++++++++++++++++---------------------------------- | ||
11 | 1 file changed, 28 insertions(+), 45 deletions(-) | ||
12 | |||
13 | diff --git a/tcg/tci.c b/tcg/tci.c | ||
14 | index XXXXXXX..XXXXXXX 100644 | ||
15 | --- a/tcg/tci.c | ||
16 | +++ b/tcg/tci.c | ||
17 | @@ -XXX,XX +XXX,XX @@ static bool tci_compare64(uint64_t u0, uint64_t u1, TCGCond condition) | ||
18 | return result; | ||
19 | } | ||
20 | |||
21 | -#ifdef CONFIG_SOFTMMU | ||
22 | -# define qemu_ld_ub \ | ||
23 | - helper_ret_ldub_mmu(env, taddr, oi, (uintptr_t)tb_ptr) | ||
24 | -# define qemu_ld_leuw \ | ||
25 | - helper_le_lduw_mmu(env, taddr, oi, (uintptr_t)tb_ptr) | ||
26 | -# define qemu_ld_leul \ | ||
27 | - helper_le_ldul_mmu(env, taddr, oi, (uintptr_t)tb_ptr) | ||
28 | -# define qemu_ld_leq \ | ||
29 | - helper_le_ldq_mmu(env, taddr, oi, (uintptr_t)tb_ptr) | ||
30 | -# define qemu_ld_beuw \ | ||
31 | - helper_be_lduw_mmu(env, taddr, oi, (uintptr_t)tb_ptr) | ||
32 | -# define qemu_ld_beul \ | ||
33 | - helper_be_ldul_mmu(env, taddr, oi, (uintptr_t)tb_ptr) | ||
34 | -# define qemu_ld_beq \ | ||
35 | - helper_be_ldq_mmu(env, taddr, oi, (uintptr_t)tb_ptr) | ||
36 | -# define qemu_st_b(X) \ | ||
37 | - helper_ret_stb_mmu(env, taddr, X, oi, (uintptr_t)tb_ptr) | ||
38 | -# define qemu_st_lew(X) \ | ||
39 | - helper_le_stw_mmu(env, taddr, X, oi, (uintptr_t)tb_ptr) | ||
40 | -# define qemu_st_lel(X) \ | ||
41 | - helper_le_stl_mmu(env, taddr, X, oi, (uintptr_t)tb_ptr) | ||
42 | -# define qemu_st_leq(X) \ | ||
43 | - helper_le_stq_mmu(env, taddr, X, oi, (uintptr_t)tb_ptr) | ||
44 | -# define qemu_st_bew(X) \ | ||
45 | - helper_be_stw_mmu(env, taddr, X, oi, (uintptr_t)tb_ptr) | ||
46 | -# define qemu_st_bel(X) \ | ||
47 | - helper_be_stl_mmu(env, taddr, X, oi, (uintptr_t)tb_ptr) | ||
48 | -# define qemu_st_beq(X) \ | ||
49 | - helper_be_stq_mmu(env, taddr, X, oi, (uintptr_t)tb_ptr) | ||
50 | -#else | ||
51 | -# define qemu_ld_ub ldub_p(g2h(taddr)) | ||
52 | -# define qemu_ld_leuw lduw_le_p(g2h(taddr)) | ||
53 | -# define qemu_ld_leul (uint32_t)ldl_le_p(g2h(taddr)) | ||
54 | -# define qemu_ld_leq ldq_le_p(g2h(taddr)) | ||
55 | -# define qemu_ld_beuw lduw_be_p(g2h(taddr)) | ||
56 | -# define qemu_ld_beul (uint32_t)ldl_be_p(g2h(taddr)) | ||
57 | -# define qemu_ld_beq ldq_be_p(g2h(taddr)) | ||
58 | -# define qemu_st_b(X) stb_p(g2h(taddr), X) | ||
59 | -# define qemu_st_lew(X) stw_le_p(g2h(taddr), X) | ||
60 | -# define qemu_st_lel(X) stl_le_p(g2h(taddr), X) | ||
61 | -# define qemu_st_leq(X) stq_le_p(g2h(taddr), X) | ||
62 | -# define qemu_st_bew(X) stw_be_p(g2h(taddr), X) | ||
63 | -# define qemu_st_bel(X) stl_be_p(g2h(taddr), X) | ||
64 | -# define qemu_st_beq(X) stq_be_p(g2h(taddr), X) | ||
65 | -#endif | ||
66 | +#define qemu_ld_ub \ | ||
67 | + cpu_ldub_mmuidx_ra(env, taddr, get_mmuidx(oi), (uintptr_t)tb_ptr) | ||
68 | +#define qemu_ld_leuw \ | ||
69 | + cpu_lduw_le_mmuidx_ra(env, taddr, get_mmuidx(oi), (uintptr_t)tb_ptr) | ||
70 | +#define qemu_ld_leul \ | ||
71 | + cpu_ldl_le_mmuidx_ra(env, taddr, get_mmuidx(oi), (uintptr_t)tb_ptr) | ||
72 | +#define qemu_ld_leq \ | ||
73 | + cpu_ldq_le_mmuidx_ra(env, taddr, get_mmuidx(oi), (uintptr_t)tb_ptr) | ||
74 | +#define qemu_ld_beuw \ | ||
75 | + cpu_lduw_be_mmuidx_ra(env, taddr, get_mmuidx(oi), (uintptr_t)tb_ptr) | ||
76 | +#define qemu_ld_beul \ | ||
77 | + cpu_ldl_be_mmuidx_ra(env, taddr, get_mmuidx(oi), (uintptr_t)tb_ptr) | ||
78 | +#define qemu_ld_beq \ | ||
79 | + cpu_ldq_be_mmuidx_ra(env, taddr, get_mmuidx(oi), (uintptr_t)tb_ptr) | ||
80 | +#define qemu_st_b(X) \ | ||
81 | + cpu_stb_mmuidx_ra(env, taddr, X, get_mmuidx(oi), (uintptr_t)tb_ptr) | ||
82 | +#define qemu_st_lew(X) \ | ||
83 | + cpu_stw_le_mmuidx_ra(env, taddr, X, get_mmuidx(oi), (uintptr_t)tb_ptr) | ||
84 | +#define qemu_st_lel(X) \ | ||
85 | + cpu_stl_le_mmuidx_ra(env, taddr, X, get_mmuidx(oi), (uintptr_t)tb_ptr) | ||
86 | +#define qemu_st_leq(X) \ | ||
87 | + cpu_stq_le_mmuidx_ra(env, taddr, X, get_mmuidx(oi), (uintptr_t)tb_ptr) | ||
88 | +#define qemu_st_bew(X) \ | ||
89 | + cpu_stw_be_mmuidx_ra(env, taddr, X, get_mmuidx(oi), (uintptr_t)tb_ptr) | ||
90 | +#define qemu_st_bel(X) \ | ||
91 | + cpu_stl_be_mmuidx_ra(env, taddr, X, get_mmuidx(oi), (uintptr_t)tb_ptr) | ||
92 | +#define qemu_st_beq(X) \ | ||
93 | + cpu_stq_be_mmuidx_ra(env, taddr, X, get_mmuidx(oi), (uintptr_t)tb_ptr) | ||
94 | |||
95 | #if TCG_TARGET_REG_BITS == 64 | ||
96 | # define CASE_32_64(x) \ | ||
97 | -- | ||
98 | 2.25.1 | ||
99 | |||
New patch | |||
---|---|---|---|
1 | Allow other places in tcg to restart with a smaller tb. | ||
1 | 2 | ||
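A minimal stand-alone sketch of the restart pattern being factored out here, with plain longjmp instead of QEMU's siglongjmp and a made-up overflow condition: the translation loop sets a jump point, and anything that runs out of room can bail back to it and retry with a smaller TB.

    #include <setjmp.h>
    #include <stdio.h>

    static jmp_buf jmp_trans;

    static void raise_tb_overflow(void)
    {
        longjmp(jmp_trans, -2);          /* never returns; QEMU uses siglongjmp(s->jmp_trans, -2) */
    }

    static int translate(int max_insns)
    {
        if (max_insns > 4) {             /* pretend some internal table overflowed */
            raise_tb_overflow();
        }
        return max_insns;                /* "translated" this many guest insns */
    }

    int main(void)
    {
        volatile int max_insns = 16;

        if (setjmp(jmp_trans) == -2) {
            max_insns /= 2;              /* restart with a smaller TB */
        }
        printf("translated %d insns\n", translate(max_insns));
        return 0;
    }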
3 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
4 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
5 | --- | ||
6 | tcg/tcg.c | 9 +++++++-- | ||
7 | 1 file changed, 7 insertions(+), 2 deletions(-) | ||
8 | |||
9 | diff --git a/tcg/tcg.c b/tcg/tcg.c | ||
10 | index XXXXXXX..XXXXXXX 100644 | ||
11 | --- a/tcg/tcg.c | ||
12 | +++ b/tcg/tcg.c | ||
13 | @@ -XXX,XX +XXX,XX @@ static void set_jmp_reset_offset(TCGContext *s, int which) | ||
14 | s->tb_jmp_reset_offset[which] = tcg_current_code_size(s); | ||
15 | } | ||
16 | |||
17 | +/* Signal overflow, starting over with fewer guest insns. */ | ||
18 | +static void QEMU_NORETURN tcg_raise_tb_overflow(TCGContext *s) | ||
19 | +{ | ||
20 | + siglongjmp(s->jmp_trans, -2); | ||
21 | +} | ||
22 | + | ||
23 | #define C_PFX1(P, A) P##A | ||
24 | #define C_PFX2(P, A, B) P##A##_##B | ||
25 | #define C_PFX3(P, A, B, C) P##A##_##B##_##C | ||
26 | @@ -XXX,XX +XXX,XX @@ static TCGTemp *tcg_temp_alloc(TCGContext *s) | ||
27 | int n = s->nb_temps++; | ||
28 | |||
29 | if (n >= TCG_MAX_TEMPS) { | ||
30 | - /* Signal overflow, starting over with fewer guest insns. */ | ||
31 | - siglongjmp(s->jmp_trans, -2); | ||
32 | + tcg_raise_tb_overflow(s); | ||
33 | } | ||
34 | return memset(&s->temps[n], 0, sizeof(TCGTemp)); | ||
35 | } | ||
36 | -- | ||
37 | 2.25.1 | ||
38 | |||
1 | From: Claudio Fontana <cfontana@suse.de> | 1 | The use in tcg_tb_lookup is given a random pc that comes from the pc |
---|---|---|---|
2 | of a signal handler. Do not assert that the pointer is already within | ||
3 | the code gen buffer at all, much less the writable mirror of it. | ||
2 | 4 | ||
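A stand-alone sketch of the behaviour this patch wants from tc_ptr_to_region_tree, with made-up buffer bounds and split-W^X offset: accept a pc from either the writable (rw) or executable (rx) mapping, and report "not found" for anything else instead of asserting.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Made-up rw-view bounds and rx offset (rx view = rw view + diff). */
    static const uintptr_t rw_start = 0x1000, rw_end = 0x2000;
    static const uintptr_t splitwx_diff = 0x100000;

    static bool in_code_gen_buffer(uintptr_t p)
    {
        return p >= rw_start && p < rw_end;
    }

    static uintptr_t to_rw_or_zero(uintptr_t p)
    {
        if (!in_code_gen_buffer(p)) {
            p -= splitwx_diff;           /* maybe it was an rx-view pc */
            if (!in_code_gen_buffer(p)) {
                return 0;                /* tcg_tb_lookup would return NULL here */
            }
        }
        return p;
    }

    int main(void)
    {
        printf("rw pc    -> %#lx\n", (unsigned long)to_rw_or_zero(0x1800));
        printf("rx pc    -> %#lx\n", (unsigned long)to_rw_or_zero(0x1800 + 0x100000));
        printf("stray pc -> %#lx\n", (unsigned long)to_rw_or_zero(0xdeadbeef));
        return 0;
    }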
3 | split up the CpusAccel tcg_cpus into three TCG variants: | 5 | Fixes: db0c51a3803 |
4 | |||
5 | tcg_cpus_rr (single threaded, round robin cpus) | ||
6 | tcg_cpus_icount (same as rr, but with instruction counting enabled) | ||
7 | tcg_cpus_mttcg (multi-threaded cpus) | ||
8 | |||
9 | Suggested-by: Richard Henderson <richard.henderson@linaro.org> | ||
10 | Signed-off-by: Claudio Fontana <cfontana@suse.de> | ||
11 | Reviewed-by: Richard Henderson <richard.henderson@linaro.org> | ||
12 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
13 | Message-Id: <20201015143217.29337-2-cfontana@suse.de> | ||
14 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | 6 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> |
15 | --- | 7 | --- |
16 | accel/tcg/tcg-cpus-icount.h | 17 ++ | 8 | tcg/tcg.c | 20 ++++++++++++++++++-- |
17 | accel/tcg/tcg-cpus-mttcg.h | 21 ++ | 9 | 1 file changed, 18 insertions(+), 2 deletions(-) |
18 | accel/tcg/tcg-cpus-rr.h | 20 ++ | ||
19 | accel/tcg/tcg-cpus.h | 13 +- | ||
20 | accel/tcg/tcg-all.c | 8 +- | ||
21 | accel/tcg/tcg-cpus-icount.c | 147 +++++++++++ | ||
22 | accel/tcg/tcg-cpus-mttcg.c | 117 +++++++++ | ||
23 | accel/tcg/tcg-cpus-rr.c | 270 ++++++++++++++++++++ | ||
24 | accel/tcg/tcg-cpus.c | 484 ++---------------------------------- | ||
25 | softmmu/icount.c | 2 +- | ||
26 | accel/tcg/meson.build | 9 +- | ||
27 | 11 files changed, 646 insertions(+), 462 deletions(-) | ||
28 | create mode 100644 accel/tcg/tcg-cpus-icount.h | ||
29 | create mode 100644 accel/tcg/tcg-cpus-mttcg.h | ||
30 | create mode 100644 accel/tcg/tcg-cpus-rr.h | ||
31 | create mode 100644 accel/tcg/tcg-cpus-icount.c | ||
32 | create mode 100644 accel/tcg/tcg-cpus-mttcg.c | ||
33 | create mode 100644 accel/tcg/tcg-cpus-rr.c | ||
34 | 10 | ||
35 | diff --git a/accel/tcg/tcg-cpus-icount.h b/accel/tcg/tcg-cpus-icount.h | 11 | diff --git a/tcg/tcg.c b/tcg/tcg.c |
36 | new file mode 100644 | ||
37 | index XXXXXXX..XXXXXXX | ||
38 | --- /dev/null | ||
39 | +++ b/accel/tcg/tcg-cpus-icount.h | ||
40 | @@ -XXX,XX +XXX,XX @@ | ||
41 | +/* | ||
42 | + * QEMU TCG Single Threaded vCPUs implementation using instruction counting | ||
43 | + * | ||
44 | + * Copyright 2020 SUSE LLC | ||
45 | + * | ||
46 | + * This work is licensed under the terms of the GNU GPL, version 2 or later. | ||
47 | + * See the COPYING file in the top-level directory. | ||
48 | + */ | ||
49 | + | ||
50 | +#ifndef TCG_CPUS_ICOUNT_H | ||
51 | +#define TCG_CPUS_ICOUNT_H | ||
52 | + | ||
53 | +void handle_icount_deadline(void); | ||
54 | +void prepare_icount_for_run(CPUState *cpu); | ||
55 | +void process_icount_data(CPUState *cpu); | ||
56 | + | ||
57 | +#endif /* TCG_CPUS_ICOUNT_H */ | ||
58 | diff --git a/accel/tcg/tcg-cpus-mttcg.h b/accel/tcg/tcg-cpus-mttcg.h | ||
59 | new file mode 100644 | ||
60 | index XXXXXXX..XXXXXXX | ||
61 | --- /dev/null | ||
62 | +++ b/accel/tcg/tcg-cpus-mttcg.h | ||
63 | @@ -XXX,XX +XXX,XX @@ | ||
64 | +/* | ||
65 | + * QEMU TCG Multi Threaded vCPUs implementation | ||
66 | + * | ||
67 | + * Copyright 2020 SUSE LLC | ||
68 | + * | ||
69 | + * This work is licensed under the terms of the GNU GPL, version 2 or later. | ||
70 | + * See the COPYING file in the top-level directory. | ||
71 | + */ | ||
72 | + | ||
73 | +#ifndef TCG_CPUS_MTTCG_H | ||
74 | +#define TCG_CPUS_MTTCG_H | ||
75 | + | ||
76 | +/* | ||
77 | + * In the multi-threaded case each vCPU has its own thread. The TLS | ||
78 | + * variable current_cpu can be used deep in the code to find the | ||
79 | + * current CPUState for a given thread. | ||
80 | + */ | ||
81 | + | ||
82 | +void *tcg_cpu_thread_fn(void *arg); | ||
83 | + | ||
84 | +#endif /* TCG_CPUS_MTTCG_H */ | ||
85 | diff --git a/accel/tcg/tcg-cpus-rr.h b/accel/tcg/tcg-cpus-rr.h | ||
86 | new file mode 100644 | ||
87 | index XXXXXXX..XXXXXXX | ||
88 | --- /dev/null | ||
89 | +++ b/accel/tcg/tcg-cpus-rr.h | ||
90 | @@ -XXX,XX +XXX,XX @@ | ||
91 | +/* | ||
92 | + * QEMU TCG Single Threaded vCPUs implementation | ||
93 | + * | ||
94 | + * Copyright 2020 SUSE LLC | ||
95 | + * | ||
96 | + * This work is licensed under the terms of the GNU GPL, version 2 or later. | ||
97 | + * See the COPYING file in the top-level directory. | ||
98 | + */ | ||
99 | + | ||
100 | +#ifndef TCG_CPUS_RR_H | ||
101 | +#define TCG_CPUS_RR_H | ||
102 | + | ||
103 | +#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10) | ||
104 | + | ||
105 | +/* Kick all RR vCPUs. */ | ||
106 | +void qemu_cpu_kick_rr_cpus(CPUState *unused); | ||
107 | + | ||
108 | +void *tcg_rr_cpu_thread_fn(void *arg); | ||
109 | + | ||
110 | +#endif /* TCG_CPUS_RR_H */ | ||
111 | diff --git a/accel/tcg/tcg-cpus.h b/accel/tcg/tcg-cpus.h | ||
112 | index XXXXXXX..XXXXXXX 100644 | 12 | index XXXXXXX..XXXXXXX 100644 |
113 | --- a/accel/tcg/tcg-cpus.h | 13 | --- a/tcg/tcg.c |
114 | +++ b/accel/tcg/tcg-cpus.h | 14 | +++ b/tcg/tcg.c |
115 | @@ -XXX,XX +XXX,XX @@ | 15 | @@ -XXX,XX +XXX,XX @@ static void tcg_region_trees_init(void) |
116 | /* | 16 | } |
117 | - * Accelerator CPUS Interface | ||
118 | + * QEMU TCG vCPU common functionality | ||
119 | + * | ||
120 | + * Functionality common to all TCG vcpu variants: mttcg, rr and icount. | ||
121 | * | ||
122 | * Copyright 2020 SUSE LLC | ||
123 | * | ||
124 | @@ -XXX,XX +XXX,XX @@ | ||
125 | |||
126 | #include "sysemu/cpus.h" | ||
127 | |||
128 | -extern const CpusAccel tcg_cpus; | ||
129 | +extern const CpusAccel tcg_cpus_mttcg; | ||
130 | +extern const CpusAccel tcg_cpus_icount; | ||
131 | +extern const CpusAccel tcg_cpus_rr; | ||
132 | + | ||
133 | +void tcg_start_vcpu_thread(CPUState *cpu); | ||
134 | +void qemu_tcg_destroy_vcpu(CPUState *cpu); | ||
135 | +int tcg_cpu_exec(CPUState *cpu); | ||
136 | +void tcg_handle_interrupt(CPUState *cpu, int mask); | ||
137 | |||
138 | #endif /* TCG_CPUS_H */ | ||
139 | diff --git a/accel/tcg/tcg-all.c b/accel/tcg/tcg-all.c | ||
140 | index XXXXXXX..XXXXXXX 100644 | ||
141 | --- a/accel/tcg/tcg-all.c | ||
142 | +++ b/accel/tcg/tcg-all.c | ||
143 | @@ -XXX,XX +XXX,XX @@ static int tcg_init(MachineState *ms) | ||
144 | |||
145 | tcg_exec_init(s->tb_size * 1024 * 1024); | ||
146 | mttcg_enabled = s->mttcg_enabled; | ||
147 | - cpus_register_accel(&tcg_cpus); | ||
148 | |||
149 | + if (mttcg_enabled) { | ||
150 | + cpus_register_accel(&tcg_cpus_mttcg); | ||
151 | + } else if (icount_enabled()) { | ||
152 | + cpus_register_accel(&tcg_cpus_icount); | ||
153 | + } else { | ||
154 | + cpus_register_accel(&tcg_cpus_rr); | ||
155 | + } | ||
156 | return 0; | ||
157 | } | 17 | } |
158 | 18 | ||
159 | diff --git a/accel/tcg/tcg-cpus-icount.c b/accel/tcg/tcg-cpus-icount.c | 19 | -static struct tcg_region_tree *tc_ptr_to_region_tree(const void *cp) |
160 | new file mode 100644 | 20 | +static struct tcg_region_tree *tc_ptr_to_region_tree(const void *p) |
161 | index XXXXXXX..XXXXXXX | 21 | { |
162 | --- /dev/null | 22 | - void *p = tcg_splitwx_to_rw(cp); |
163 | +++ b/accel/tcg/tcg-cpus-icount.c | 23 | size_t region_idx; |
164 | @@ -XXX,XX +XXX,XX @@ | 24 | |
165 | +/* | ||
166 | + * QEMU TCG Single Threaded vCPUs implementation using instruction counting | ||
167 | + * | ||
168 | + * Copyright (c) 2003-2008 Fabrice Bellard | ||
169 | + * Copyright (c) 2014 Red Hat Inc. | ||
170 | + * | ||
171 | + * Permission is hereby granted, free of charge, to any person obtaining a copy | ||
172 | + * of this software and associated documentation files (the "Software"), to deal | ||
173 | + * in the Software without restriction, including without limitation the rights | ||
174 | + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
175 | + * copies of the Software, and to permit persons to whom the Software is | ||
176 | + * furnished to do so, subject to the following conditions: | ||
177 | + * | ||
178 | + * The above copyright notice and this permission notice shall be included in | ||
179 | + * all copies or substantial portions of the Software. | ||
180 | + * | ||
181 | + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
182 | + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
183 | + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | ||
184 | + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
185 | + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
186 | + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
187 | + * THE SOFTWARE. | ||
188 | + */ | ||
189 | + | ||
190 | +#include "qemu/osdep.h" | ||
191 | +#include "qemu-common.h" | ||
192 | +#include "sysemu/tcg.h" | ||
193 | +#include "sysemu/replay.h" | ||
194 | +#include "qemu/main-loop.h" | ||
195 | +#include "qemu/guest-random.h" | ||
196 | +#include "exec/exec-all.h" | ||
197 | +#include "hw/boards.h" | ||
198 | + | ||
199 | +#include "tcg-cpus.h" | ||
200 | +#include "tcg-cpus-icount.h" | ||
201 | +#include "tcg-cpus-rr.h" | ||
202 | + | ||
203 | +static int64_t tcg_get_icount_limit(void) | ||
204 | +{ | ||
205 | + int64_t deadline; | ||
206 | + | ||
207 | + if (replay_mode != REPLAY_MODE_PLAY) { | ||
208 | + /* | ||
209 | + * Include all the timers, because they may need an attention. | ||
210 | + * Too long CPU execution may create unnecessary delay in UI. | ||
211 | + */ | ||
212 | + deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL, | ||
213 | + QEMU_TIMER_ATTR_ALL); | ||
214 | + /* Check realtime timers, because they help with input processing */ | ||
215 | + deadline = qemu_soonest_timeout(deadline, | ||
216 | + qemu_clock_deadline_ns_all(QEMU_CLOCK_REALTIME, | ||
217 | + QEMU_TIMER_ATTR_ALL)); | ||
218 | + | ||
219 | + /* | ||
220 | + * Maintain prior (possibly buggy) behaviour where if no deadline | ||
221 | + * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than | ||
222 | + * INT32_MAX nanoseconds ahead, we still use INT32_MAX | ||
223 | + * nanoseconds. | ||
224 | + */ | ||
225 | + if ((deadline < 0) || (deadline > INT32_MAX)) { | ||
226 | + deadline = INT32_MAX; | ||
227 | + } | ||
228 | + | ||
229 | + return icount_round(deadline); | ||
230 | + } else { | ||
231 | + return replay_get_instructions(); | ||
232 | + } | ||
233 | +} | ||
234 | + | ||
235 | +static void notify_aio_contexts(void) | ||
236 | +{ | ||
237 | + /* Wake up other AioContexts. */ | ||
238 | + qemu_clock_notify(QEMU_CLOCK_VIRTUAL); | ||
239 | + qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL); | ||
240 | +} | ||
241 | + | ||
242 | +void handle_icount_deadline(void) | ||
243 | +{ | ||
244 | + assert(qemu_in_vcpu_thread()); | ||
245 | + int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL, | ||
246 | + QEMU_TIMER_ATTR_ALL); | ||
247 | + | ||
248 | + if (deadline == 0) { | ||
249 | + notify_aio_contexts(); | ||
250 | + } | ||
251 | +} | ||
252 | + | ||
253 | +void prepare_icount_for_run(CPUState *cpu) | ||
254 | +{ | ||
255 | + int insns_left; | ||
256 | + | ||
257 | + /* | 25 | + /* |
258 | + * These should always be cleared by process_icount_data after | 26 | + * Like tcg_splitwx_to_rw, with no assert. The pc may come from |
259 | + * each vCPU execution. However u16.high can be raised | 27 | + * a signal handler over which the caller has no control. |
260 | + * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt | ||
261 | + */ | 28 | + */ |
262 | + g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0); | 29 | + if (!in_code_gen_buffer(p)) { |
263 | + g_assert(cpu->icount_extra == 0); | 30 | + p -= tcg_splitwx_diff; |
264 | + | 31 | + if (!in_code_gen_buffer(p)) { |
265 | + cpu->icount_budget = tcg_get_icount_limit(); | 32 | + return NULL; |
266 | + insns_left = MIN(0xffff, cpu->icount_budget); | ||
267 | + cpu_neg(cpu)->icount_decr.u16.low = insns_left; | ||
268 | + cpu->icount_extra = cpu->icount_budget - insns_left; | ||
269 | + | ||
270 | + replay_mutex_lock(); | ||
271 | + | ||
272 | + if (cpu->icount_budget == 0 && replay_has_checkpoint()) { | ||
273 | + notify_aio_contexts(); | ||
274 | + } | ||
275 | +} | ||
276 | + | ||
277 | +void process_icount_data(CPUState *cpu) | ||
278 | +{ | ||
279 | + /* Account for executed instructions */ | ||
280 | + icount_update(cpu); | ||
281 | + | ||
282 | + /* Reset the counters */ | ||
283 | + cpu_neg(cpu)->icount_decr.u16.low = 0; | ||
284 | + cpu->icount_extra = 0; | ||
285 | + cpu->icount_budget = 0; | ||
286 | + | ||
287 | + replay_account_executed_instructions(); | ||
288 | + | ||
289 | + replay_mutex_unlock(); | ||
290 | +} | ||
291 | + | ||
292 | +static void icount_handle_interrupt(CPUState *cpu, int mask) | ||
293 | +{ | ||
294 | + int old_mask = cpu->interrupt_request; | ||
295 | + | ||
296 | + tcg_handle_interrupt(cpu, mask); | ||
297 | + if (qemu_cpu_is_self(cpu) && | ||
298 | + !cpu->can_do_io | ||
299 | + && (mask & ~old_mask) != 0) { | ||
300 | + cpu_abort(cpu, "Raised interrupt while not in I/O function"); | ||
301 | + } | ||
302 | +} | ||
303 | + | ||
304 | +const CpusAccel tcg_cpus_icount = { | ||
305 | + .create_vcpu_thread = tcg_start_vcpu_thread, | ||
306 | + .kick_vcpu_thread = qemu_cpu_kick_rr_cpus, | ||
307 | + | ||
308 | + .handle_interrupt = icount_handle_interrupt, | ||
309 | + .get_virtual_clock = icount_get, | ||
310 | + .get_elapsed_ticks = icount_get, | ||
311 | +}; | ||
312 | diff --git a/accel/tcg/tcg-cpus-mttcg.c b/accel/tcg/tcg-cpus-mttcg.c | ||
313 | new file mode 100644 | ||
314 | index XXXXXXX..XXXXXXX | ||
315 | --- /dev/null | ||
316 | +++ b/accel/tcg/tcg-cpus-mttcg.c | ||
317 | @@ -XXX,XX +XXX,XX @@ | ||
318 | +/* | ||
319 | + * QEMU TCG Multi Threaded vCPUs implementation | ||
320 | + * | ||
321 | + * Copyright (c) 2003-2008 Fabrice Bellard | ||
322 | + * Copyright (c) 2014 Red Hat Inc. | ||
323 | + * | ||
324 | + * Permission is hereby granted, free of charge, to any person obtaining a copy | ||
325 | + * of this software and associated documentation files (the "Software"), to deal | ||
326 | + * in the Software without restriction, including without limitation the rights | ||
327 | + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
328 | + * copies of the Software, and to permit persons to whom the Software is | ||
329 | + * furnished to do so, subject to the following conditions: | ||
330 | + * | ||
331 | + * The above copyright notice and this permission notice shall be included in | ||
332 | + * all copies or substantial portions of the Software. | ||
333 | + * | ||
334 | + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
335 | + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
336 | + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | ||
337 | + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
338 | + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
339 | + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
340 | + * THE SOFTWARE. | ||
341 | + */ | ||
342 | + | ||
343 | +#include "qemu/osdep.h" | ||
344 | +#include "qemu-common.h" | ||
345 | +#include "sysemu/tcg.h" | ||
346 | +#include "sysemu/replay.h" | ||
347 | +#include "qemu/main-loop.h" | ||
348 | +#include "qemu/guest-random.h" | ||
349 | +#include "exec/exec-all.h" | ||
350 | +#include "hw/boards.h" | ||
351 | + | ||
352 | +#include "tcg-cpus.h" | ||
353 | +#include "tcg-cpus-mttcg.h" | ||
354 | + | ||
355 | +/* | ||
356 | + * In the multi-threaded case each vCPU has its own thread. The TLS | ||
357 | + * variable current_cpu can be used deep in the code to find the | ||
358 | + * current CPUState for a given thread. | ||
359 | + */ | ||
360 | + | ||
361 | +void *tcg_cpu_thread_fn(void *arg) | ||
362 | +{ | ||
363 | + CPUState *cpu = arg; | ||
364 | + | ||
365 | + assert(tcg_enabled()); | ||
366 | + g_assert(!icount_enabled()); | ||
367 | + | ||
368 | + rcu_register_thread(); | ||
369 | + tcg_register_thread(); | ||
370 | + | ||
371 | + qemu_mutex_lock_iothread(); | ||
372 | + qemu_thread_get_self(cpu->thread); | ||
373 | + | ||
374 | + cpu->thread_id = qemu_get_thread_id(); | ||
375 | + cpu->can_do_io = 1; | ||
376 | + current_cpu = cpu; | ||
377 | + cpu_thread_signal_created(cpu); | ||
378 | + qemu_guest_random_seed_thread_part2(cpu->random_seed); | ||
379 | + | ||
380 | + /* process any pending work */ | ||
381 | + cpu->exit_request = 1; | ||
382 | + | ||
383 | + do { | ||
384 | + if (cpu_can_run(cpu)) { | ||
385 | + int r; | ||
386 | + qemu_mutex_unlock_iothread(); | ||
387 | + r = tcg_cpu_exec(cpu); | ||
388 | + qemu_mutex_lock_iothread(); | ||
389 | + switch (r) { | ||
390 | + case EXCP_DEBUG: | ||
391 | + cpu_handle_guest_debug(cpu); | ||
392 | + break; | ||
393 | + case EXCP_HALTED: | ||
394 | + /* | ||
395 | + * during start-up the vCPU is reset and the thread is | ||
396 | + * kicked several times. If we don't ensure we go back | ||
397 | + * to sleep in the halted state we won't cleanly | ||
398 | + * start-up when the vCPU is enabled. | ||
399 | + * | ||
400 | + * cpu->halted should ensure we sleep in wait_io_event | ||
401 | + */ | ||
402 | + g_assert(cpu->halted); | ||
403 | + break; | ||
404 | + case EXCP_ATOMIC: | ||
405 | + qemu_mutex_unlock_iothread(); | ||
406 | + cpu_exec_step_atomic(cpu); | ||
407 | + qemu_mutex_lock_iothread(); | ||
408 | + default: | ||
409 | + /* Ignore everything else? */ | ||
410 | + break; | ||
411 | + } | ||
412 | + } | ||
413 | + | ||
414 | + qatomic_mb_set(&cpu->exit_request, 0); | ||
415 | + qemu_wait_io_event(cpu); | ||
416 | + } while (!cpu->unplug || cpu_can_run(cpu)); | ||
417 | + | ||
418 | + qemu_tcg_destroy_vcpu(cpu); | ||
419 | + qemu_mutex_unlock_iothread(); | ||
420 | + rcu_unregister_thread(); | ||
421 | + return NULL; | ||
422 | +} | ||
423 | + | ||
424 | +static void mttcg_kick_vcpu_thread(CPUState *cpu) | ||
425 | +{ | ||
426 | + cpu_exit(cpu); | ||
427 | +} | ||
428 | + | ||
429 | +const CpusAccel tcg_cpus_mttcg = { | ||
430 | + .create_vcpu_thread = tcg_start_vcpu_thread, | ||
431 | + .kick_vcpu_thread = mttcg_kick_vcpu_thread, | ||
432 | + | ||
433 | + .handle_interrupt = tcg_handle_interrupt, | ||
434 | +}; | ||
435 | diff --git a/accel/tcg/tcg-cpus-rr.c b/accel/tcg/tcg-cpus-rr.c | ||
436 | new file mode 100644 | ||
437 | index XXXXXXX..XXXXXXX | ||
438 | --- /dev/null | ||
439 | +++ b/accel/tcg/tcg-cpus-rr.c | ||
440 | @@ -XXX,XX +XXX,XX @@ | ||
441 | +/* | ||
442 | + * QEMU TCG Single Threaded vCPUs implementation | ||
443 | + * | ||
444 | + * Copyright (c) 2003-2008 Fabrice Bellard | ||
445 | + * Copyright (c) 2014 Red Hat Inc. | ||
446 | + * | ||
447 | + * Permission is hereby granted, free of charge, to any person obtaining a copy | ||
448 | + * of this software and associated documentation files (the "Software"), to deal | ||
449 | + * in the Software without restriction, including without limitation the rights | ||
450 | + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
451 | + * copies of the Software, and to permit persons to whom the Software is | ||
452 | + * furnished to do so, subject to the following conditions: | ||
453 | + * | ||
454 | + * The above copyright notice and this permission notice shall be included in | ||
455 | + * all copies or substantial portions of the Software. | ||
456 | + * | ||
457 | + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
458 | + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
459 | + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | ||
460 | + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
461 | + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
462 | + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
463 | + * THE SOFTWARE. | ||
464 | + */ | ||
465 | + | ||
466 | +#include "qemu/osdep.h" | ||
467 | +#include "qemu-common.h" | ||
468 | +#include "sysemu/tcg.h" | ||
469 | +#include "sysemu/replay.h" | ||
470 | +#include "qemu/main-loop.h" | ||
471 | +#include "qemu/guest-random.h" | ||
472 | +#include "exec/exec-all.h" | ||
473 | +#include "hw/boards.h" | ||
474 | + | ||
475 | +#include "tcg-cpus.h" | ||
476 | +#include "tcg-cpus-rr.h" | ||
477 | +#include "tcg-cpus-icount.h" | ||
478 | + | ||
479 | +/* Kick all RR vCPUs */ | ||
480 | +void qemu_cpu_kick_rr_cpus(CPUState *unused) | ||
481 | +{ | ||
482 | + CPUState *cpu; | ||
483 | + | ||
484 | + CPU_FOREACH(cpu) { | ||
485 | + cpu_exit(cpu); | ||
486 | + }; | ||
487 | +} | ||
488 | + | ||
489 | +/* | ||
490 | + * TCG vCPU kick timer | ||
491 | + * | ||
492 | + * The kick timer is responsible for moving single threaded vCPU | ||
493 | + * emulation on to the next vCPU. If more than one vCPU is running a | ||
494 | + * timer event with force a cpu->exit so the next vCPU can get | ||
495 | + * scheduled. | ||
496 | + * | ||
497 | + * The timer is removed if all vCPUs are idle and restarted again once | ||
498 | + * idleness is complete. | ||
499 | + */ | ||
500 | + | ||
501 | +static QEMUTimer *tcg_kick_vcpu_timer; | ||
502 | +static CPUState *tcg_current_rr_cpu; | ||
503 | + | ||
504 | +#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10) | ||
505 | + | ||
506 | +static inline int64_t qemu_tcg_next_kick(void) | ||
507 | +{ | ||
508 | + return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD; | ||
509 | +} | ||
510 | + | ||
511 | +/* Kick the currently round-robin scheduled vCPU to next */ | ||
512 | +static void qemu_cpu_kick_rr_next_cpu(void) | ||
513 | +{ | ||
514 | + CPUState *cpu; | ||
515 | + do { | ||
516 | + cpu = qatomic_mb_read(&tcg_current_rr_cpu); | ||
517 | + if (cpu) { | ||
518 | + cpu_exit(cpu); | ||
519 | + } | ||
520 | + } while (cpu != qatomic_mb_read(&tcg_current_rr_cpu)); | ||
521 | +} | ||
522 | + | ||
523 | +static void kick_tcg_thread(void *opaque) | ||
524 | +{ | ||
525 | + timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick()); | ||
526 | + qemu_cpu_kick_rr_next_cpu(); | ||
527 | +} | ||
528 | + | ||
529 | +static void start_tcg_kick_timer(void) | ||
530 | +{ | ||
531 | + if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) { | ||
532 | + tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, | ||
533 | + kick_tcg_thread, NULL); | ||
534 | + } | ||
535 | + if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) { | ||
536 | + timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick()); | ||
537 | + } | ||
538 | +} | ||
539 | + | ||
540 | +static void stop_tcg_kick_timer(void) | ||
541 | +{ | ||
542 | + if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) { | ||
543 | + timer_del(tcg_kick_vcpu_timer); | ||
544 | + } | ||
545 | +} | ||
546 | + | ||
547 | +static void qemu_tcg_rr_wait_io_event(void) | ||
548 | +{ | ||
549 | + CPUState *cpu; | ||
550 | + | ||
551 | + while (all_cpu_threads_idle()) { | ||
552 | + stop_tcg_kick_timer(); | ||
553 | + qemu_cond_wait_iothread(first_cpu->halt_cond); | ||
554 | + } | ||
555 | + | ||
556 | + start_tcg_kick_timer(); | ||
557 | + | ||
558 | + CPU_FOREACH(cpu) { | ||
559 | + qemu_wait_io_event_common(cpu); | ||
560 | + } | ||
561 | +} | ||
562 | + | ||
563 | +/* | ||
564 | + * Destroy any remaining vCPUs which have been unplugged and have | ||
565 | + * finished running | ||
566 | + */ | ||
567 | +static void deal_with_unplugged_cpus(void) | ||
568 | +{ | ||
569 | + CPUState *cpu; | ||
570 | + | ||
571 | + CPU_FOREACH(cpu) { | ||
572 | + if (cpu->unplug && !cpu_can_run(cpu)) { | ||
573 | + qemu_tcg_destroy_vcpu(cpu); | ||
574 | + break; | ||
575 | + } | ||
576 | + } | ||
577 | +} | ||
578 | + | ||
579 | +/* | ||
580 | + * In the single-threaded case each vCPU is simulated in turn. If | ||
581 | + * there is more than a single vCPU we create a simple timer to kick | ||
582 | + * the vCPU and ensure we don't get stuck in a tight loop in one vCPU. | ||
583 | + * This is done explicitly rather than relying on side-effects | ||
584 | + * elsewhere. | ||
585 | + */ | ||
586 | + | ||
587 | +void *tcg_rr_cpu_thread_fn(void *arg) | ||
588 | +{ | ||
589 | + CPUState *cpu = arg; | ||
590 | + | ||
591 | + assert(tcg_enabled()); | ||
592 | + rcu_register_thread(); | ||
593 | + tcg_register_thread(); | ||
594 | + | ||
595 | + qemu_mutex_lock_iothread(); | ||
596 | + qemu_thread_get_self(cpu->thread); | ||
597 | + | ||
598 | + cpu->thread_id = qemu_get_thread_id(); | ||
599 | + cpu->can_do_io = 1; | ||
600 | + cpu_thread_signal_created(cpu); | ||
601 | + qemu_guest_random_seed_thread_part2(cpu->random_seed); | ||
602 | + | ||
603 | + /* wait for initial kick-off after machine start */ | ||
604 | + while (first_cpu->stopped) { | ||
605 | + qemu_cond_wait_iothread(first_cpu->halt_cond); | ||
606 | + | ||
607 | + /* process any pending work */ | ||
608 | + CPU_FOREACH(cpu) { | ||
609 | + current_cpu = cpu; | ||
610 | + qemu_wait_io_event_common(cpu); | ||
611 | + } | 33 | + } |
612 | + } | 34 | + } |
613 | + | 35 | + |
614 | + start_tcg_kick_timer(); | 36 | if (p < region.start_aligned) { |
615 | + | 37 | region_idx = 0; |
616 | + cpu = first_cpu; | 38 | } else { |
617 | + | 39 | @@ -XXX,XX +XXX,XX @@ void tcg_tb_insert(TranslationBlock *tb) |
618 | + /* process any pending work */ | 40 | { |
619 | + cpu->exit_request = 1; | 41 | struct tcg_region_tree *rt = tc_ptr_to_region_tree(tb->tc.ptr); |
620 | + | 42 | |
621 | + while (1) { | 43 | + g_assert(rt != NULL); |
622 | + qemu_mutex_unlock_iothread(); | 44 | qemu_mutex_lock(&rt->lock); |
623 | + replay_mutex_lock(); | 45 | g_tree_insert(rt->tree, &tb->tc, tb); |
624 | + qemu_mutex_lock_iothread(); | 46 | qemu_mutex_unlock(&rt->lock); |
625 | + | 47 | @@ -XXX,XX +XXX,XX @@ void tcg_tb_remove(TranslationBlock *tb) |
626 | + if (icount_enabled()) { | 48 | { |
627 | + /* Account partial waits to QEMU_CLOCK_VIRTUAL. */ | 49 | struct tcg_region_tree *rt = tc_ptr_to_region_tree(tb->tc.ptr); |
628 | + icount_account_warp_timer(); | 50 | |
629 | + /* | 51 | + g_assert(rt != NULL); |
630 | + * Run the timers here. This is much more efficient than | 52 | qemu_mutex_lock(&rt->lock); |
631 | + * waking up the I/O thread and waiting for completion. | 53 | g_tree_remove(rt->tree, &tb->tc); |
632 | + */ | 54 | qemu_mutex_unlock(&rt->lock); |
633 | + handle_icount_deadline(); | 55 | @@ -XXX,XX +XXX,XX @@ TranslationBlock *tcg_tb_lookup(uintptr_t tc_ptr) |
634 | + } | 56 | TranslationBlock *tb; |
635 | + | 57 | struct tb_tc s = { .ptr = (void *)tc_ptr }; |
636 | + replay_mutex_unlock(); | 58 | |
637 | + | 59 | + if (rt == NULL) { |
638 | + if (!cpu) { | 60 | + return NULL; |
639 | + cpu = first_cpu; | ||
640 | + } | ||
641 | + | ||
642 | + while (cpu && cpu_work_list_empty(cpu) && !cpu->exit_request) { | ||
643 | + | ||
644 | + qatomic_mb_set(&tcg_current_rr_cpu, cpu); | ||
645 | + current_cpu = cpu; | ||
646 | + | ||
647 | + qemu_clock_enable(QEMU_CLOCK_VIRTUAL, | ||
648 | + (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0); | ||
649 | + | ||
650 | + if (cpu_can_run(cpu)) { | ||
651 | + int r; | ||
652 | + | ||
653 | + qemu_mutex_unlock_iothread(); | ||
654 | + if (icount_enabled()) { | ||
655 | + prepare_icount_for_run(cpu); | ||
656 | + } | ||
657 | + r = tcg_cpu_exec(cpu); | ||
658 | + if (icount_enabled()) { | ||
659 | + process_icount_data(cpu); | ||
660 | + } | ||
661 | + qemu_mutex_lock_iothread(); | ||
662 | + | ||
663 | + if (r == EXCP_DEBUG) { | ||
664 | + cpu_handle_guest_debug(cpu); | ||
665 | + break; | ||
666 | + } else if (r == EXCP_ATOMIC) { | ||
667 | + qemu_mutex_unlock_iothread(); | ||
668 | + cpu_exec_step_atomic(cpu); | ||
669 | + qemu_mutex_lock_iothread(); | ||
670 | + break; | ||
671 | + } | ||
672 | + } else if (cpu->stop) { | ||
673 | + if (cpu->unplug) { | ||
674 | + cpu = CPU_NEXT(cpu); | ||
675 | + } | ||
676 | + break; | ||
677 | + } | ||
678 | + | ||
679 | + cpu = CPU_NEXT(cpu); | ||
680 | + } /* while (cpu && !cpu->exit_request).. */ | ||
681 | + | ||
682 | + /* Does not need qatomic_mb_set because a spurious wakeup is okay. */ | ||
683 | + qatomic_set(&tcg_current_rr_cpu, NULL); | ||
684 | + | ||
685 | + if (cpu && cpu->exit_request) { | ||
686 | + qatomic_mb_set(&cpu->exit_request, 0); | ||
687 | + } | ||
688 | + | ||
689 | + if (icount_enabled() && all_cpu_threads_idle()) { | ||
690 | + /* | ||
691 | + * When all cpus are sleeping (e.g in WFI), to avoid a deadlock | ||
692 | + * in the main_loop, wake it up in order to start the warp timer. | ||
693 | + */ | ||
694 | + qemu_notify_event(); | ||
695 | + } | ||
696 | + | ||
697 | + qemu_tcg_rr_wait_io_event(); | ||
698 | + deal_with_unplugged_cpus(); | ||
699 | + } | 61 | + } |
700 | + | 62 | + |
701 | + rcu_unregister_thread(); | 63 | qemu_mutex_lock(&rt->lock); |
702 | + return NULL; | 64 | tb = g_tree_lookup(rt->tree, &s); |
703 | +} | 65 | qemu_mutex_unlock(&rt->lock); |
704 | + | ||
705 | +const CpusAccel tcg_cpus_rr = { | ||
706 | + .create_vcpu_thread = tcg_start_vcpu_thread, | ||
707 | + .kick_vcpu_thread = qemu_cpu_kick_rr_cpus, | ||
708 | + | ||
709 | + .handle_interrupt = tcg_handle_interrupt, | ||
710 | +}; | ||
711 | diff --git a/accel/tcg/tcg-cpus.c b/accel/tcg/tcg-cpus.c | ||
712 | index XXXXXXX..XXXXXXX 100644 | ||
713 | --- a/accel/tcg/tcg-cpus.c | ||
714 | +++ b/accel/tcg/tcg-cpus.c | ||
715 | @@ -XXX,XX +XXX,XX @@ | ||
716 | /* | ||
717 | - * QEMU System Emulator | ||
718 | + * QEMU TCG vCPU common functionality | ||
719 | + * | ||
720 | + * Functionality common to all TCG vCPU variants: mttcg, rr and icount. | ||
721 | * | ||
722 | * Copyright (c) 2003-2008 Fabrice Bellard | ||
723 | * Copyright (c) 2014 Red Hat Inc. | ||
724 | @@ -XXX,XX +XXX,XX @@ | ||
725 | #include "hw/boards.h" | ||
726 | |||
727 | #include "tcg-cpus.h" | ||
728 | +#include "tcg-cpus-mttcg.h" | ||
729 | +#include "tcg-cpus-rr.h" | ||
730 | |||
731 | -/* Kick all RR vCPUs */ | ||
732 | -static void qemu_cpu_kick_rr_cpus(void) | ||
733 | -{ | ||
734 | - CPUState *cpu; | ||
735 | +/* common functionality among all TCG variants */ | ||
736 | |||
737 | - CPU_FOREACH(cpu) { | ||
738 | - cpu_exit(cpu); | ||
739 | - }; | ||
740 | -} | ||
741 | - | ||
742 | -static void tcg_kick_vcpu_thread(CPUState *cpu) | ||
743 | -{ | ||
744 | - if (qemu_tcg_mttcg_enabled()) { | ||
745 | - cpu_exit(cpu); | ||
746 | - } else { | ||
747 | - qemu_cpu_kick_rr_cpus(); | ||
748 | - } | ||
749 | -} | ||
750 | - | ||
751 | -/* | ||
752 | - * TCG vCPU kick timer | ||
753 | - * | ||
754 | - * The kick timer is responsible for moving single threaded vCPU | ||
755 | - * emulation on to the next vCPU. If more than one vCPU is running a | ||
756 | - * timer event with force a cpu->exit so the next vCPU can get | ||
757 | - * scheduled. | ||
758 | - * | ||
759 | - * The timer is removed if all vCPUs are idle and restarted again once | ||
760 | - * idleness is complete. | ||
761 | - */ | ||
762 | - | ||
763 | -static QEMUTimer *tcg_kick_vcpu_timer; | ||
764 | -static CPUState *tcg_current_rr_cpu; | ||
765 | - | ||
766 | -#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10) | ||
767 | - | ||
768 | -static inline int64_t qemu_tcg_next_kick(void) | ||
769 | -{ | ||
770 | - return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD; | ||
771 | -} | ||
772 | - | ||
773 | -/* Kick the currently round-robin scheduled vCPU to next */ | ||
774 | -static void qemu_cpu_kick_rr_next_cpu(void) | ||
775 | -{ | ||
776 | - CPUState *cpu; | ||
777 | - do { | ||
778 | - cpu = qatomic_mb_read(&tcg_current_rr_cpu); | ||
779 | - if (cpu) { | ||
780 | - cpu_exit(cpu); | ||
781 | - } | ||
782 | - } while (cpu != qatomic_mb_read(&tcg_current_rr_cpu)); | ||
783 | -} | ||
784 | - | ||
785 | -static void kick_tcg_thread(void *opaque) | ||
786 | -{ | ||
787 | - timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick()); | ||
788 | - qemu_cpu_kick_rr_next_cpu(); | ||
789 | -} | ||
790 | - | ||
791 | -static void start_tcg_kick_timer(void) | ||
792 | -{ | ||
793 | - assert(!mttcg_enabled); | ||
794 | - if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) { | ||
795 | - tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, | ||
796 | - kick_tcg_thread, NULL); | ||
797 | - } | ||
798 | - if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) { | ||
799 | - timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick()); | ||
800 | - } | ||
801 | -} | ||
802 | - | ||
803 | -static void stop_tcg_kick_timer(void) | ||
804 | -{ | ||
805 | - assert(!mttcg_enabled); | ||
806 | - if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) { | ||
807 | - timer_del(tcg_kick_vcpu_timer); | ||
808 | - } | ||
809 | -} | ||
810 | - | ||
811 | -static void qemu_tcg_destroy_vcpu(CPUState *cpu) | ||
812 | -{ | ||
813 | -} | ||
814 | - | ||
815 | -static void qemu_tcg_rr_wait_io_event(void) | ||
816 | -{ | ||
817 | - CPUState *cpu; | ||
818 | - | ||
819 | - while (all_cpu_threads_idle()) { | ||
820 | - stop_tcg_kick_timer(); | ||
821 | - qemu_cond_wait_iothread(first_cpu->halt_cond); | ||
822 | - } | ||
823 | - | ||
824 | - start_tcg_kick_timer(); | ||
825 | - | ||
826 | - CPU_FOREACH(cpu) { | ||
827 | - qemu_wait_io_event_common(cpu); | ||
828 | - } | ||
829 | -} | ||
830 | - | ||
831 | -static int64_t tcg_get_icount_limit(void) | ||
832 | -{ | ||
833 | - int64_t deadline; | ||
834 | - | ||
835 | - if (replay_mode != REPLAY_MODE_PLAY) { | ||
836 | - /* | ||
837 | - * Include all the timers, because they may need an attention. | ||
838 | - * Too long CPU execution may create unnecessary delay in UI. | ||
839 | - */ | ||
840 | - deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL, | ||
841 | - QEMU_TIMER_ATTR_ALL); | ||
842 | - /* Check realtime timers, because they help with input processing */ | ||
843 | - deadline = qemu_soonest_timeout(deadline, | ||
844 | - qemu_clock_deadline_ns_all(QEMU_CLOCK_REALTIME, | ||
845 | - QEMU_TIMER_ATTR_ALL)); | ||
846 | - | ||
847 | - /* | ||
848 | - * Maintain prior (possibly buggy) behaviour where if no deadline | ||
849 | - * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than | ||
850 | - * INT32_MAX nanoseconds ahead, we still use INT32_MAX | ||
851 | - * nanoseconds. | ||
852 | - */ | ||
853 | - if ((deadline < 0) || (deadline > INT32_MAX)) { | ||
854 | - deadline = INT32_MAX; | ||
855 | - } | ||
856 | - | ||
857 | - return icount_round(deadline); | ||
858 | - } else { | ||
859 | - return replay_get_instructions(); | ||
860 | - } | ||
861 | -} | ||
862 | - | ||
863 | -static void notify_aio_contexts(void) | ||
864 | -{ | ||
865 | - /* Wake up other AioContexts. */ | ||
866 | - qemu_clock_notify(QEMU_CLOCK_VIRTUAL); | ||
867 | - qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL); | ||
868 | -} | ||
869 | - | ||
870 | -static void handle_icount_deadline(void) | ||
871 | -{ | ||
872 | - assert(qemu_in_vcpu_thread()); | ||
873 | - if (icount_enabled()) { | ||
874 | - int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL, | ||
875 | - QEMU_TIMER_ATTR_ALL); | ||
876 | - | ||
877 | - if (deadline == 0) { | ||
878 | - notify_aio_contexts(); | ||
879 | - } | ||
880 | - } | ||
881 | -} | ||
882 | - | ||
883 | -static void prepare_icount_for_run(CPUState *cpu) | ||
884 | -{ | ||
885 | - if (icount_enabled()) { | ||
886 | - int insns_left; | ||
887 | - | ||
888 | - /* | ||
889 | - * These should always be cleared by process_icount_data after | ||
890 | - * each vCPU execution. However u16.high can be raised | ||
891 | - * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt | ||
892 | - */ | ||
893 | - g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0); | ||
894 | - g_assert(cpu->icount_extra == 0); | ||
895 | - | ||
896 | - cpu->icount_budget = tcg_get_icount_limit(); | ||
897 | - insns_left = MIN(0xffff, cpu->icount_budget); | ||
898 | - cpu_neg(cpu)->icount_decr.u16.low = insns_left; | ||
899 | - cpu->icount_extra = cpu->icount_budget - insns_left; | ||
900 | - | ||
901 | - replay_mutex_lock(); | ||
902 | - | ||
903 | - if (cpu->icount_budget == 0 && replay_has_checkpoint()) { | ||
904 | - notify_aio_contexts(); | ||
905 | - } | ||
906 | - } | ||
907 | -} | ||
908 | - | ||
909 | -static void process_icount_data(CPUState *cpu) | ||
910 | -{ | ||
911 | - if (icount_enabled()) { | ||
912 | - /* Account for executed instructions */ | ||
913 | - icount_update(cpu); | ||
914 | - | ||
915 | - /* Reset the counters */ | ||
916 | - cpu_neg(cpu)->icount_decr.u16.low = 0; | ||
917 | - cpu->icount_extra = 0; | ||
918 | - cpu->icount_budget = 0; | ||
919 | - | ||
920 | - replay_account_executed_instructions(); | ||
921 | - | ||
922 | - replay_mutex_unlock(); | ||
923 | - } | ||
924 | -} | ||
925 | - | ||
926 | -static int tcg_cpu_exec(CPUState *cpu) | ||
927 | -{ | ||
928 | - int ret; | ||
929 | -#ifdef CONFIG_PROFILER | ||
930 | - int64_t ti; | ||
931 | -#endif | ||
932 | - | ||
933 | - assert(tcg_enabled()); | ||
934 | -#ifdef CONFIG_PROFILER | ||
935 | - ti = profile_getclock(); | ||
936 | -#endif | ||
937 | - cpu_exec_start(cpu); | ||
938 | - ret = cpu_exec(cpu); | ||
939 | - cpu_exec_end(cpu); | ||
940 | -#ifdef CONFIG_PROFILER | ||
941 | - qatomic_set(&tcg_ctx->prof.cpu_exec_time, | ||
942 | - tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti); | ||
943 | -#endif | ||
944 | - return ret; | ||
945 | -} | ||
946 | - | ||
947 | -/* | ||
948 | - * Destroy any remaining vCPUs which have been unplugged and have | ||
949 | - * finished running | ||
950 | - */ | ||
951 | -static void deal_with_unplugged_cpus(void) | ||
952 | -{ | ||
953 | - CPUState *cpu; | ||
954 | - | ||
955 | - CPU_FOREACH(cpu) { | ||
956 | - if (cpu->unplug && !cpu_can_run(cpu)) { | ||
957 | - qemu_tcg_destroy_vcpu(cpu); | ||
958 | - cpu_thread_signal_destroyed(cpu); | ||
959 | - break; | ||
960 | - } | ||
961 | - } | ||
962 | -} | ||
963 | - | ||
964 | -/* | ||
965 | - * Single-threaded TCG | ||
966 | - * | ||
967 | - * In the single-threaded case each vCPU is simulated in turn. If | ||
968 | - * there is more than a single vCPU we create a simple timer to kick | ||
969 | - * the vCPU and ensure we don't get stuck in a tight loop in one vCPU. | ||
970 | - * This is done explicitly rather than relying on side-effects | ||
971 | - * elsewhere. | ||
972 | - */ | ||
973 | - | ||
974 | -static void *tcg_rr_cpu_thread_fn(void *arg) | ||
975 | -{ | ||
976 | - CPUState *cpu = arg; | ||
977 | - | ||
978 | - assert(tcg_enabled()); | ||
979 | - rcu_register_thread(); | ||
980 | - tcg_register_thread(); | ||
981 | - | ||
982 | - qemu_mutex_lock_iothread(); | ||
983 | - qemu_thread_get_self(cpu->thread); | ||
984 | - | ||
985 | - cpu->thread_id = qemu_get_thread_id(); | ||
986 | - cpu->can_do_io = 1; | ||
987 | - cpu_thread_signal_created(cpu); | ||
988 | - qemu_guest_random_seed_thread_part2(cpu->random_seed); | ||
989 | - | ||
990 | - /* wait for initial kick-off after machine start */ | ||
991 | - while (first_cpu->stopped) { | ||
992 | - qemu_cond_wait_iothread(first_cpu->halt_cond); | ||
993 | - | ||
994 | - /* process any pending work */ | ||
995 | - CPU_FOREACH(cpu) { | ||
996 | - current_cpu = cpu; | ||
997 | - qemu_wait_io_event_common(cpu); | ||
998 | - } | ||
999 | - } | ||
1000 | - | ||
1001 | - start_tcg_kick_timer(); | ||
1002 | - | ||
1003 | - cpu = first_cpu; | ||
1004 | - | ||
1005 | - /* process any pending work */ | ||
1006 | - cpu->exit_request = 1; | ||
1007 | - | ||
1008 | - while (1) { | ||
1009 | - qemu_mutex_unlock_iothread(); | ||
1010 | - replay_mutex_lock(); | ||
1011 | - qemu_mutex_lock_iothread(); | ||
1012 | - /* Account partial waits to QEMU_CLOCK_VIRTUAL. */ | ||
1013 | - icount_account_warp_timer(); | ||
1014 | - | ||
1015 | - /* | ||
1016 | - * Run the timers here. This is much more efficient than | ||
1017 | - * waking up the I/O thread and waiting for completion. | ||
1018 | - */ | ||
1019 | - handle_icount_deadline(); | ||
1020 | - | ||
1021 | - replay_mutex_unlock(); | ||
1022 | - | ||
1023 | - if (!cpu) { | ||
1024 | - cpu = first_cpu; | ||
1025 | - } | ||
1026 | - | ||
1027 | - while (cpu && cpu_work_list_empty(cpu) && !cpu->exit_request) { | ||
1028 | - | ||
1029 | - qatomic_mb_set(&tcg_current_rr_cpu, cpu); | ||
1030 | - current_cpu = cpu; | ||
1031 | - | ||
1032 | - qemu_clock_enable(QEMU_CLOCK_VIRTUAL, | ||
1033 | - (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0); | ||
1034 | - | ||
1035 | - if (cpu_can_run(cpu)) { | ||
1036 | - int r; | ||
1037 | - | ||
1038 | - qemu_mutex_unlock_iothread(); | ||
1039 | - prepare_icount_for_run(cpu); | ||
1040 | - | ||
1041 | - r = tcg_cpu_exec(cpu); | ||
1042 | - | ||
1043 | - process_icount_data(cpu); | ||
1044 | - qemu_mutex_lock_iothread(); | ||
1045 | - | ||
1046 | - if (r == EXCP_DEBUG) { | ||
1047 | - cpu_handle_guest_debug(cpu); | ||
1048 | - break; | ||
1049 | - } else if (r == EXCP_ATOMIC) { | ||
1050 | - qemu_mutex_unlock_iothread(); | ||
1051 | - cpu_exec_step_atomic(cpu); | ||
1052 | - qemu_mutex_lock_iothread(); | ||
1053 | - break; | ||
1054 | - } | ||
1055 | - } else if (cpu->stop) { | ||
1056 | - if (cpu->unplug) { | ||
1057 | - cpu = CPU_NEXT(cpu); | ||
1058 | - } | ||
1059 | - break; | ||
1060 | - } | ||
1061 | - | ||
1062 | - cpu = CPU_NEXT(cpu); | ||
1063 | - } /* while (cpu && !cpu->exit_request).. */ | ||
1064 | - | ||
1065 | - /* Does not need qatomic_mb_set because a spurious wakeup is okay. */ | ||
1066 | - qatomic_set(&tcg_current_rr_cpu, NULL); | ||
1067 | - | ||
1068 | - if (cpu && cpu->exit_request) { | ||
1069 | - qatomic_mb_set(&cpu->exit_request, 0); | ||
1070 | - } | ||
1071 | - | ||
1072 | - if (icount_enabled() && all_cpu_threads_idle()) { | ||
1073 | - /* | ||
1074 | - * When all cpus are sleeping (e.g in WFI), to avoid a deadlock | ||
1075 | - * in the main_loop, wake it up in order to start the warp timer. | ||
1076 | - */ | ||
1077 | - qemu_notify_event(); | ||
1078 | - } | ||
1079 | - | ||
1080 | - qemu_tcg_rr_wait_io_event(); | ||
1081 | - deal_with_unplugged_cpus(); | ||
1082 | - } | ||
1083 | - | ||
1084 | - rcu_unregister_thread(); | ||
1085 | - return NULL; | ||
1086 | -} | ||
1087 | - | ||
1088 | -/* | ||
1089 | - * Multi-threaded TCG | ||
1090 | - * | ||
1091 | - * In the multi-threaded case each vCPU has its own thread. The TLS | ||
1092 | - * variable current_cpu can be used deep in the code to find the | ||
1093 | - * current CPUState for a given thread. | ||
1094 | - */ | ||
1095 | - | ||
1096 | -static void *tcg_cpu_thread_fn(void *arg) | ||
1097 | -{ | ||
1098 | - CPUState *cpu = arg; | ||
1099 | - | ||
1100 | - assert(tcg_enabled()); | ||
1101 | - g_assert(!icount_enabled()); | ||
1102 | - | ||
1103 | - rcu_register_thread(); | ||
1104 | - tcg_register_thread(); | ||
1105 | - | ||
1106 | - qemu_mutex_lock_iothread(); | ||
1107 | - qemu_thread_get_self(cpu->thread); | ||
1108 | - | ||
1109 | - cpu->thread_id = qemu_get_thread_id(); | ||
1110 | - cpu->can_do_io = 1; | ||
1111 | - current_cpu = cpu; | ||
1112 | - cpu_thread_signal_created(cpu); | ||
1113 | - qemu_guest_random_seed_thread_part2(cpu->random_seed); | ||
1114 | - | ||
1115 | - /* process any pending work */ | ||
1116 | - cpu->exit_request = 1; | ||
1117 | - | ||
1118 | - do { | ||
1119 | - if (cpu_can_run(cpu)) { | ||
1120 | - int r; | ||
1121 | - qemu_mutex_unlock_iothread(); | ||
1122 | - r = tcg_cpu_exec(cpu); | ||
1123 | - qemu_mutex_lock_iothread(); | ||
1124 | - switch (r) { | ||
1125 | - case EXCP_DEBUG: | ||
1126 | - cpu_handle_guest_debug(cpu); | ||
1127 | - break; | ||
1128 | - case EXCP_HALTED: | ||
1129 | - /* | ||
1130 | - * during start-up the vCPU is reset and the thread is | ||
1131 | - * kicked several times. If we don't ensure we go back | ||
1132 | - * to sleep in the halted state we won't cleanly | ||
1133 | - * start-up when the vCPU is enabled. | ||
1134 | - * | ||
1135 | - * cpu->halted should ensure we sleep in wait_io_event | ||
1136 | - */ | ||
1137 | - g_assert(cpu->halted); | ||
1138 | - break; | ||
1139 | - case EXCP_ATOMIC: | ||
1140 | - qemu_mutex_unlock_iothread(); | ||
1141 | - cpu_exec_step_atomic(cpu); | ||
1142 | - qemu_mutex_lock_iothread(); | ||
1143 | - default: | ||
1144 | - /* Ignore everything else? */ | ||
1145 | - break; | ||
1146 | - } | ||
1147 | - } | ||
1148 | - | ||
1149 | - qatomic_mb_set(&cpu->exit_request, 0); | ||
1150 | - qemu_wait_io_event(cpu); | ||
1151 | - } while (!cpu->unplug || cpu_can_run(cpu)); | ||
1152 | - | ||
1153 | - qemu_tcg_destroy_vcpu(cpu); | ||
1154 | - cpu_thread_signal_destroyed(cpu); | ||
1155 | - qemu_mutex_unlock_iothread(); | ||
1156 | - rcu_unregister_thread(); | ||
1157 | - return NULL; | ||
1158 | -} | ||
1159 | - | ||
1160 | -static void tcg_start_vcpu_thread(CPUState *cpu) | ||
1161 | +void tcg_start_vcpu_thread(CPUState *cpu) | ||
1162 | { | ||
1163 | char thread_name[VCPU_THREAD_NAME_SIZE]; | ||
1164 | static QemuCond *single_tcg_halt_cond; | ||
1165 | @@ -XXX,XX +XXX,XX @@ static void tcg_start_vcpu_thread(CPUState *cpu) | ||
1166 | } | ||
1167 | } | ||
1168 | |||
1169 | -static int64_t tcg_get_virtual_clock(void) | ||
1170 | +void qemu_tcg_destroy_vcpu(CPUState *cpu) | ||
1171 | { | ||
1172 | - if (icount_enabled()) { | ||
1173 | - return icount_get(); | ||
1174 | - } | ||
1175 | - return cpu_get_clock(); | ||
1176 | + cpu_thread_signal_destroyed(cpu); | ||
1177 | } | ||
1178 | |||
1179 | -static int64_t tcg_get_elapsed_ticks(void) | ||
1180 | +int tcg_cpu_exec(CPUState *cpu) | ||
1181 | { | ||
1182 | - if (icount_enabled()) { | ||
1183 | - return icount_get(); | ||
1184 | - } | ||
1185 | - return cpu_get_ticks(); | ||
1186 | + int ret; | ||
1187 | +#ifdef CONFIG_PROFILER | ||
1188 | + int64_t ti; | ||
1189 | +#endif | ||
1190 | + assert(tcg_enabled()); | ||
1191 | +#ifdef CONFIG_PROFILER | ||
1192 | + ti = profile_getclock(); | ||
1193 | +#endif | ||
1194 | + cpu_exec_start(cpu); | ||
1195 | + ret = cpu_exec(cpu); | ||
1196 | + cpu_exec_end(cpu); | ||
1197 | +#ifdef CONFIG_PROFILER | ||
1198 | + qatomic_set(&tcg_ctx->prof.cpu_exec_time, | ||
1199 | + tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti); | ||
1200 | +#endif | ||
1201 | + return ret; | ||
1202 | } | ||
1203 | |||
1204 | /* mask must never be zero, except for A20 change call */ | ||
1205 | -static void tcg_handle_interrupt(CPUState *cpu, int mask) | ||
1206 | +void tcg_handle_interrupt(CPUState *cpu, int mask) | ||
1207 | { | ||
1208 | - int old_mask; | ||
1209 | g_assert(qemu_mutex_iothread_locked()); | ||
1210 | |||
1211 | - old_mask = cpu->interrupt_request; | ||
1212 | cpu->interrupt_request |= mask; | ||
1213 | |||
1214 | /* | ||
1215 | @@ -XXX,XX +XXX,XX @@ static void tcg_handle_interrupt(CPUState *cpu, int mask) | ||
1216 | qemu_cpu_kick(cpu); | ||
1217 | } else { | ||
1218 | qatomic_set(&cpu_neg(cpu)->icount_decr.u16.high, -1); | ||
1219 | - if (icount_enabled() && | ||
1220 | - !cpu->can_do_io | ||
1221 | - && (mask & ~old_mask) != 0) { | ||
1222 | - cpu_abort(cpu, "Raised interrupt while not in I/O function"); | ||
1223 | - } | ||
1224 | } | ||
1225 | } | ||
1226 | - | ||
1227 | -const CpusAccel tcg_cpus = { | ||
1228 | - .create_vcpu_thread = tcg_start_vcpu_thread, | ||
1229 | - .kick_vcpu_thread = tcg_kick_vcpu_thread, | ||
1230 | - | ||
1231 | - .handle_interrupt = tcg_handle_interrupt, | ||
1232 | - | ||
1233 | - .get_virtual_clock = tcg_get_virtual_clock, | ||
1234 | - .get_elapsed_ticks = tcg_get_elapsed_ticks, | ||
1235 | -}; | ||
1236 | diff --git a/softmmu/icount.c b/softmmu/icount.c | ||
1237 | index XXXXXXX..XXXXXXX 100644 | ||
1238 | --- a/softmmu/icount.c | ||
1239 | +++ b/softmmu/icount.c | ||
1240 | @@ -XXX,XX +XXX,XX @@ void icount_start_warp_timer(void) | ||
1241 | |||
1242 | void icount_account_warp_timer(void) | ||
1243 | { | ||
1244 | - if (!icount_enabled() || !icount_sleep) { | ||
1245 | + if (!icount_sleep) { | ||
1246 | return; | ||
1247 | } | ||
1248 | |||
1249 | diff --git a/accel/tcg/meson.build b/accel/tcg/meson.build | ||
1250 | index XXXXXXX..XXXXXXX 100644 | ||
1251 | --- a/accel/tcg/meson.build | ||
1252 | +++ b/accel/tcg/meson.build | ||
1253 | @@ -XXX,XX +XXX,XX @@ tcg_ss.add(when: 'CONFIG_SOFTMMU', if_false: files('user-exec-stub.c')) | ||
1254 | tcg_ss.add(when: 'CONFIG_PLUGIN', if_true: [files('plugin-gen.c'), libdl]) | ||
1255 | specific_ss.add_all(when: 'CONFIG_TCG', if_true: tcg_ss) | ||
1256 | |||
1257 | -specific_ss.add(when: ['CONFIG_SOFTMMU', 'CONFIG_TCG'], if_true: files('tcg-all.c', 'cputlb.c', 'tcg-cpus.c')) | ||
1258 | +specific_ss.add(when: ['CONFIG_SOFTMMU', 'CONFIG_TCG'], if_true: files( | ||
1259 | + 'tcg-all.c', | ||
1260 | + 'cputlb.c', | ||
1261 | + 'tcg-cpus.c', | ||
1262 | + 'tcg-cpus-mttcg.c', | ||
1263 | + 'tcg-cpus-icount.c', | ||
1264 | + 'tcg-cpus-rr.c' | ||
1265 | +)) | ||
1266 | -- | 66 | -- |
1267 | 2.25.1 | 67 | 2.25.1 |
1268 | 68 | ||
New patch | |||
---|---|---|---|
1 | Use CASE_32_64 and CASE_64 to reduce ifdefs and merge | ||
2 | cases that are identical between 32-bit and 64-bit hosts. | ||
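For illustration only, a stand-alone sketch of the trick (the OP_* names, classify() and the
local xglue()/glue() helpers below are made-up stand-ins for the real INDEX_op_* values and
tcg_out_op(), not the QEMU definitions):

    /* Illustrative sketch only; compiles on its own. */
    #include <stdio.h>

    /* Two-level paste so nested glue() arguments expand before pasting. */
    #define xglue(x, y) x ## y
    #define glue(x, y)  xglue(x, y)

    /* As on a 64-bit host: the _i32 and _i64 opcodes share one pair of labels. */
    #define CASE_32_64(x) \
        case glue(glue(OP_, x), _i64): \
        case glue(glue(OP_, x), _i32):
    #define CASE_64(x) \
        case glue(glue(OP_, x), _i64):

    enum { OP_add_i32, OP_add_i64, OP_ext32u_i64, OP_br };

    static const char *classify(int op)
    {
        switch (op) {
        CASE_32_64(add)      /* expands to: case OP_add_i64: case OP_add_i32: */
        CASE_64(ext32u)      /* expands to: case OP_ext32u_i64: */
            return "merged case";
        default:
            return "everything else";
        }
    }

    int main(void)
    {
        printf("%s\n", classify(OP_add_i32));   /* merged case */
        printf("%s\n", classify(OP_br));        /* everything else */
        return 0;
    }

On a 32-bit host the patch defines CASE_64(x) as empty, so the same case list serves both
builds without an #ifdef around each individual opcode.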
1 | 3 | ||
4 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
5 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
6 | Message-Id: <20210217202036.1724901-5-richard.henderson@linaro.org> | ||
7 | [PMD: Split patch as 1/5] | ||
8 | Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
9 | Message-Id: <20210218232840.1760806-2-f4bug@amsat.org> | ||
10 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
11 | --- | ||
12 | tcg/tci/tcg-target.c.inc | 85 +++++++++++++++++----------------------- | ||
13 | 1 file changed, 37 insertions(+), 48 deletions(-) | ||
14 | |||
15 | diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc | ||
16 | index XXXXXXX..XXXXXXX 100644 | ||
17 | --- a/tcg/tci/tcg-target.c.inc | ||
18 | +++ b/tcg/tci/tcg-target.c.inc | ||
19 | @@ -XXX,XX +XXX,XX @@ static inline void tcg_out_call(TCGContext *s, const tcg_insn_unit *arg) | ||
20 | old_code_ptr[1] = s->code_ptr - old_code_ptr; | ||
21 | } | ||
22 | |||
23 | +#if TCG_TARGET_REG_BITS == 64 | ||
24 | +# define CASE_32_64(x) \ | ||
25 | + case glue(glue(INDEX_op_, x), _i64): \ | ||
26 | + case glue(glue(INDEX_op_, x), _i32): | ||
27 | +# define CASE_64(x) \ | ||
28 | + case glue(glue(INDEX_op_, x), _i64): | ||
29 | +#else | ||
30 | +# define CASE_32_64(x) \ | ||
31 | + case glue(glue(INDEX_op_, x), _i32): | ||
32 | +# define CASE_64(x) | ||
33 | +#endif | ||
34 | + | ||
35 | static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, | ||
36 | const int *const_args) | ||
37 | { | ||
38 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, | ||
39 | case INDEX_op_exit_tb: | ||
40 | tcg_out64(s, args[0]); | ||
41 | break; | ||
42 | + | ||
43 | case INDEX_op_goto_tb: | ||
44 | if (s->tb_jmp_insn_offset) { | ||
45 | /* Direct jump method. */ | ||
46 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, | ||
47 | tcg_debug_assert(args[2] == (int32_t)args[2]); | ||
48 | tcg_out32(s, args[2]); | ||
49 | break; | ||
50 | - case INDEX_op_add_i32: | ||
51 | - case INDEX_op_sub_i32: | ||
52 | - case INDEX_op_mul_i32: | ||
53 | - case INDEX_op_and_i32: | ||
54 | - case INDEX_op_andc_i32: /* Optional (TCG_TARGET_HAS_andc_i32). */ | ||
55 | - case INDEX_op_eqv_i32: /* Optional (TCG_TARGET_HAS_eqv_i32). */ | ||
56 | - case INDEX_op_nand_i32: /* Optional (TCG_TARGET_HAS_nand_i32). */ | ||
57 | - case INDEX_op_nor_i32: /* Optional (TCG_TARGET_HAS_nor_i32). */ | ||
58 | - case INDEX_op_or_i32: | ||
59 | - case INDEX_op_orc_i32: /* Optional (TCG_TARGET_HAS_orc_i32). */ | ||
60 | - case INDEX_op_xor_i32: | ||
61 | - case INDEX_op_shl_i32: | ||
62 | - case INDEX_op_shr_i32: | ||
63 | - case INDEX_op_sar_i32: | ||
64 | - case INDEX_op_rotl_i32: /* Optional (TCG_TARGET_HAS_rot_i32). */ | ||
65 | - case INDEX_op_rotr_i32: /* Optional (TCG_TARGET_HAS_rot_i32). */ | ||
66 | + | ||
67 | + CASE_32_64(add) | ||
68 | + CASE_32_64(sub) | ||
69 | + CASE_32_64(mul) | ||
70 | + CASE_32_64(and) | ||
71 | + CASE_32_64(or) | ||
72 | + CASE_32_64(xor) | ||
73 | + CASE_32_64(andc) /* Optional (TCG_TARGET_HAS_andc_*). */ | ||
74 | + CASE_32_64(orc) /* Optional (TCG_TARGET_HAS_orc_*). */ | ||
75 | + CASE_32_64(eqv) /* Optional (TCG_TARGET_HAS_eqv_*). */ | ||
76 | + CASE_32_64(nand) /* Optional (TCG_TARGET_HAS_nand_*). */ | ||
77 | + CASE_32_64(nor) /* Optional (TCG_TARGET_HAS_nor_*). */ | ||
78 | + CASE_32_64(shl) | ||
79 | + CASE_32_64(shr) | ||
80 | + CASE_32_64(sar) | ||
81 | + CASE_32_64(rotl) /* Optional (TCG_TARGET_HAS_rot_*). */ | ||
82 | + CASE_32_64(rotr) /* Optional (TCG_TARGET_HAS_rot_*). */ | ||
83 | + CASE_32_64(div) /* Optional (TCG_TARGET_HAS_div_*). */ | ||
84 | + CASE_32_64(divu) /* Optional (TCG_TARGET_HAS_div_*). */ | ||
85 | + CASE_32_64(rem) /* Optional (TCG_TARGET_HAS_div_*). */ | ||
86 | + CASE_32_64(remu) /* Optional (TCG_TARGET_HAS_div_*). */ | ||
87 | tcg_out_r(s, args[0]); | ||
88 | tcg_out_r(s, args[1]); | ||
89 | tcg_out_r(s, args[2]); | ||
90 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, | ||
91 | break; | ||
92 | |||
93 | #if TCG_TARGET_REG_BITS == 64 | ||
94 | - case INDEX_op_add_i64: | ||
95 | - case INDEX_op_sub_i64: | ||
96 | - case INDEX_op_mul_i64: | ||
97 | - case INDEX_op_and_i64: | ||
98 | - case INDEX_op_andc_i64: /* Optional (TCG_TARGET_HAS_andc_i64). */ | ||
99 | - case INDEX_op_eqv_i64: /* Optional (TCG_TARGET_HAS_eqv_i64). */ | ||
100 | - case INDEX_op_nand_i64: /* Optional (TCG_TARGET_HAS_nand_i64). */ | ||
101 | - case INDEX_op_nor_i64: /* Optional (TCG_TARGET_HAS_nor_i64). */ | ||
102 | - case INDEX_op_or_i64: | ||
103 | - case INDEX_op_orc_i64: /* Optional (TCG_TARGET_HAS_orc_i64). */ | ||
104 | - case INDEX_op_xor_i64: | ||
105 | - case INDEX_op_shl_i64: | ||
106 | - case INDEX_op_shr_i64: | ||
107 | - case INDEX_op_sar_i64: | ||
108 | - case INDEX_op_rotl_i64: /* Optional (TCG_TARGET_HAS_rot_i64). */ | ||
109 | - case INDEX_op_rotr_i64: /* Optional (TCG_TARGET_HAS_rot_i64). */ | ||
110 | - case INDEX_op_div_i64: /* Optional (TCG_TARGET_HAS_div_i64). */ | ||
111 | - case INDEX_op_divu_i64: /* Optional (TCG_TARGET_HAS_div_i64). */ | ||
112 | - case INDEX_op_rem_i64: /* Optional (TCG_TARGET_HAS_div_i64). */ | ||
113 | - case INDEX_op_remu_i64: /* Optional (TCG_TARGET_HAS_div_i64). */ | ||
114 | - tcg_out_r(s, args[0]); | ||
115 | - tcg_out_r(s, args[1]); | ||
116 | - tcg_out_r(s, args[2]); | ||
117 | - break; | ||
118 | case INDEX_op_deposit_i64: /* Optional (TCG_TARGET_HAS_deposit_i64). */ | ||
119 | tcg_out_r(s, args[0]); | ||
120 | tcg_out_r(s, args[1]); | ||
121 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, | ||
122 | tcg_out_r(s, args[0]); | ||
123 | tcg_out_r(s, args[1]); | ||
124 | break; | ||
125 | - case INDEX_op_div_i32: /* Optional (TCG_TARGET_HAS_div_i32). */ | ||
126 | - case INDEX_op_divu_i32: /* Optional (TCG_TARGET_HAS_div_i32). */ | ||
127 | - case INDEX_op_rem_i32: /* Optional (TCG_TARGET_HAS_div_i32). */ | ||
128 | - case INDEX_op_remu_i32: /* Optional (TCG_TARGET_HAS_div_i32). */ | ||
129 | - tcg_out_r(s, args[0]); | ||
130 | - tcg_out_r(s, args[1]); | ||
131 | - tcg_out_r(s, args[2]); | ||
132 | - break; | ||
133 | + | ||
134 | #if TCG_TARGET_REG_BITS == 32 | ||
135 | case INDEX_op_add2_i32: | ||
136 | case INDEX_op_sub2_i32: | ||
137 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, | ||
138 | } | ||
139 | tcg_out_i(s, *args++); | ||
140 | break; | ||
141 | + | ||
142 | case INDEX_op_mb: | ||
143 | break; | ||
144 | + | ||
145 | case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */ | ||
146 | case INDEX_op_mov_i64: | ||
147 | case INDEX_op_call: /* Always emitted via tcg_out_call. */ | ||
148 | -- | ||
149 | 2.25.1 | ||
150 | |||
New patch | |||
---|---|---|---|
1 | Use CASE_32_64 and CASE_64 to reduce ifdefs and merge | ||
2 | cases that are identical between 32-bit and 64-bit hosts. | ||
1 | 3 | ||
4 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
5 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
6 | Message-Id: <20210217202036.1724901-5-richard.henderson@linaro.org> | ||
7 | [PMD: Split patch as 2/5] | ||
8 | Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
9 | Message-Id: <20210218232840.1760806-3-f4bug@amsat.org> | ||
10 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
11 | --- | ||
12 | tcg/tci/tcg-target.c.inc | 35 ++++++++++++++--------------------- | ||
13 | 1 file changed, 14 insertions(+), 21 deletions(-) | ||
14 | |||
15 | diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc | ||
16 | index XXXXXXX..XXXXXXX 100644 | ||
17 | --- a/tcg/tci/tcg-target.c.inc | ||
18 | +++ b/tcg/tci/tcg-target.c.inc | ||
19 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, | ||
20 | tcg_out8(s, args[2]); /* condition */ | ||
21 | tci_out_label(s, arg_label(args[3])); | ||
22 | break; | ||
23 | - case INDEX_op_bswap16_i64: /* Optional (TCG_TARGET_HAS_bswap16_i64). */ | ||
24 | - case INDEX_op_bswap32_i64: /* Optional (TCG_TARGET_HAS_bswap32_i64). */ | ||
25 | - case INDEX_op_bswap64_i64: /* Optional (TCG_TARGET_HAS_bswap64_i64). */ | ||
26 | - case INDEX_op_not_i64: /* Optional (TCG_TARGET_HAS_not_i64). */ | ||
27 | - case INDEX_op_neg_i64: /* Optional (TCG_TARGET_HAS_neg_i64). */ | ||
28 | - case INDEX_op_ext8s_i64: /* Optional (TCG_TARGET_HAS_ext8s_i64). */ | ||
29 | - case INDEX_op_ext8u_i64: /* Optional (TCG_TARGET_HAS_ext8u_i64). */ | ||
30 | - case INDEX_op_ext16s_i64: /* Optional (TCG_TARGET_HAS_ext16s_i64). */ | ||
31 | - case INDEX_op_ext16u_i64: /* Optional (TCG_TARGET_HAS_ext16u_i64). */ | ||
32 | - case INDEX_op_ext32s_i64: /* Optional (TCG_TARGET_HAS_ext32s_i64). */ | ||
33 | - case INDEX_op_ext32u_i64: /* Optional (TCG_TARGET_HAS_ext32u_i64). */ | ||
34 | - case INDEX_op_ext_i32_i64: | ||
35 | - case INDEX_op_extu_i32_i64: | ||
36 | #endif /* TCG_TARGET_REG_BITS == 64 */ | ||
37 | - case INDEX_op_neg_i32: /* Optional (TCG_TARGET_HAS_neg_i32). */ | ||
38 | - case INDEX_op_not_i32: /* Optional (TCG_TARGET_HAS_not_i32). */ | ||
39 | - case INDEX_op_ext8s_i32: /* Optional (TCG_TARGET_HAS_ext8s_i32). */ | ||
40 | - case INDEX_op_ext16s_i32: /* Optional (TCG_TARGET_HAS_ext16s_i32). */ | ||
41 | - case INDEX_op_ext8u_i32: /* Optional (TCG_TARGET_HAS_ext8u_i32). */ | ||
42 | - case INDEX_op_ext16u_i32: /* Optional (TCG_TARGET_HAS_ext16u_i32). */ | ||
43 | - case INDEX_op_bswap16_i32: /* Optional (TCG_TARGET_HAS_bswap16_i32). */ | ||
44 | - case INDEX_op_bswap32_i32: /* Optional (TCG_TARGET_HAS_bswap32_i32). */ | ||
45 | + | ||
46 | + CASE_32_64(neg) /* Optional (TCG_TARGET_HAS_neg_*). */ | ||
47 | + CASE_32_64(not) /* Optional (TCG_TARGET_HAS_not_*). */ | ||
48 | + CASE_32_64(ext8s) /* Optional (TCG_TARGET_HAS_ext8s_*). */ | ||
49 | + CASE_32_64(ext8u) /* Optional (TCG_TARGET_HAS_ext8u_*). */ | ||
50 | + CASE_32_64(ext16s) /* Optional (TCG_TARGET_HAS_ext16s_*). */ | ||
51 | + CASE_32_64(ext16u) /* Optional (TCG_TARGET_HAS_ext16u_*). */ | ||
52 | + CASE_64(ext32s) /* Optional (TCG_TARGET_HAS_ext32s_i64). */ | ||
53 | + CASE_64(ext32u) /* Optional (TCG_TARGET_HAS_ext32u_i64). */ | ||
54 | + CASE_64(ext_i32) | ||
55 | + CASE_64(extu_i32) | ||
56 | + CASE_32_64(bswap16) /* Optional (TCG_TARGET_HAS_bswap16_*). */ | ||
57 | + CASE_32_64(bswap32) /* Optional (TCG_TARGET_HAS_bswap32_*). */ | ||
58 | + CASE_64(bswap64) /* Optional (TCG_TARGET_HAS_bswap64_i64). */ | ||
59 | tcg_out_r(s, args[0]); | ||
60 | tcg_out_r(s, args[1]); | ||
61 | break; | ||
62 | -- | ||
63 | 2.25.1 | ||
64 | |||
New patch | |||
---|---|---|---|
1 | Use CASE_32_64 and CASE_64 to reduce ifdefs and merge | ||
2 | cases that are identical between 32-bit and 64-bit hosts. | ||
1 | 3 | ||
4 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
5 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
6 | Message-Id: <20210217202036.1724901-5-richard.henderson@linaro.org> | ||
7 | [PMD: Split patch as 3/5] | ||
8 | Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
9 | Message-Id: <20210218232840.1760806-4-f4bug@amsat.org> | ||
10 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
11 | --- | ||
12 | tcg/tci/tcg-target.c.inc | 12 ++---------- | ||
13 | 1 file changed, 2 insertions(+), 10 deletions(-) | ||
14 | |||
15 | diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc | ||
16 | index XXXXXXX..XXXXXXX 100644 | ||
17 | --- a/tcg/tci/tcg-target.c.inc | ||
18 | +++ b/tcg/tci/tcg-target.c.inc | ||
19 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, | ||
20 | tcg_out_r(s, args[1]); | ||
21 | tcg_out_r(s, args[2]); | ||
22 | break; | ||
23 | - case INDEX_op_deposit_i32: /* Optional (TCG_TARGET_HAS_deposit_i32). */ | ||
24 | + | ||
25 | + CASE_32_64(deposit) /* Optional (TCG_TARGET_HAS_deposit_*). */ | ||
26 | tcg_out_r(s, args[0]); | ||
27 | tcg_out_r(s, args[1]); | ||
28 | tcg_out_r(s, args[2]); | ||
29 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, | ||
30 | break; | ||
31 | |||
32 | #if TCG_TARGET_REG_BITS == 64 | ||
33 | - case INDEX_op_deposit_i64: /* Optional (TCG_TARGET_HAS_deposit_i64). */ | ||
34 | - tcg_out_r(s, args[0]); | ||
35 | - tcg_out_r(s, args[1]); | ||
36 | - tcg_out_r(s, args[2]); | ||
37 | - tcg_debug_assert(args[3] <= UINT8_MAX); | ||
38 | - tcg_out8(s, args[3]); | ||
39 | - tcg_debug_assert(args[4] <= UINT8_MAX); | ||
40 | - tcg_out8(s, args[4]); | ||
41 | - break; | ||
42 | case INDEX_op_brcond_i64: | ||
43 | tcg_out_r(s, args[0]); | ||
44 | tcg_out_r(s, args[1]); | ||
45 | -- | ||
46 | 2.25.1 | ||
47 | |||
New patch | |||
---|---|---|---|
1 | Use CASE_32_64 and CASE_64 to reduce ifdefs and merge | ||
2 | cases that are identical between 32-bit and 64-bit hosts. | ||
1 | 3 | ||
4 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
5 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
6 | Message-Id: <20210217202036.1724901-5-richard.henderson@linaro.org> | ||
7 | [PMD: Split patch as 4/5] | ||
8 | Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
9 | Message-Id: <20210218232840.1760806-5-f4bug@amsat.org> | ||
10 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
11 | --- | ||
12 | tcg/tci/tcg-target.c.inc | 23 ++++++----------------- | ||
13 | 1 file changed, 6 insertions(+), 17 deletions(-) | ||
14 | |||
15 | diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc | ||
16 | index XXXXXXX..XXXXXXX 100644 | ||
17 | --- a/tcg/tci/tcg-target.c.inc | ||
18 | +++ b/tcg/tci/tcg-target.c.inc | ||
19 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, | ||
20 | } | ||
21 | set_jmp_reset_offset(s, args[0]); | ||
22 | break; | ||
23 | + | ||
24 | case INDEX_op_br: | ||
25 | tci_out_label(s, arg_label(args[0])); | ||
26 | break; | ||
27 | - case INDEX_op_setcond_i32: | ||
28 | + | ||
29 | + CASE_32_64(setcond) | ||
30 | tcg_out_r(s, args[0]); | ||
31 | tcg_out_r(s, args[1]); | ||
32 | tcg_out_r(s, args[2]); | ||
33 | tcg_out8(s, args[3]); /* condition */ | ||
34 | break; | ||
35 | + | ||
36 | #if TCG_TARGET_REG_BITS == 32 | ||
37 | case INDEX_op_setcond2_i32: | ||
38 | /* setcond2_i32 cond, t0, t1_low, t1_high, t2_low, t2_high */ | ||
39 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, | ||
40 | tcg_out_r(s, args[4]); | ||
41 | tcg_out8(s, args[5]); /* condition */ | ||
42 | break; | ||
43 | -#elif TCG_TARGET_REG_BITS == 64 | ||
44 | - case INDEX_op_setcond_i64: | ||
45 | - tcg_out_r(s, args[0]); | ||
46 | - tcg_out_r(s, args[1]); | ||
47 | - tcg_out_r(s, args[2]); | ||
48 | - tcg_out8(s, args[3]); /* condition */ | ||
49 | - break; | ||
50 | #endif | ||
51 | case INDEX_op_ld8u_i32: | ||
52 | case INDEX_op_ld8s_i32: | ||
53 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, | ||
54 | tcg_out8(s, args[4]); | ||
55 | break; | ||
56 | |||
57 | -#if TCG_TARGET_REG_BITS == 64 | ||
58 | - case INDEX_op_brcond_i64: | ||
59 | + CASE_32_64(brcond) | ||
60 | tcg_out_r(s, args[0]); | ||
61 | tcg_out_r(s, args[1]); | ||
62 | tcg_out8(s, args[2]); /* condition */ | ||
63 | tci_out_label(s, arg_label(args[3])); | ||
64 | break; | ||
65 | -#endif /* TCG_TARGET_REG_BITS == 64 */ | ||
66 | |||
67 | CASE_32_64(neg) /* Optional (TCG_TARGET_HAS_neg_*). */ | ||
68 | CASE_32_64(not) /* Optional (TCG_TARGET_HAS_not_*). */ | ||
69 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, | ||
70 | tcg_out_r(s, args[3]); | ||
71 | break; | ||
72 | #endif | ||
73 | - case INDEX_op_brcond_i32: | ||
74 | - tcg_out_r(s, args[0]); | ||
75 | - tcg_out_r(s, args[1]); | ||
76 | - tcg_out8(s, args[2]); /* condition */ | ||
77 | - tci_out_label(s, arg_label(args[3])); | ||
78 | - break; | ||
79 | + | ||
80 | case INDEX_op_qemu_ld_i32: | ||
81 | tcg_out_r(s, *args++); | ||
82 | tcg_out_r(s, *args++); | ||
83 | -- | ||
84 | 2.25.1 | ||
85 | |||
New patch | |||
---|---|---|---|
1 | Use CASE_32_64 and CASE_64 to reduce ifdefs and merge | ||
2 | cases that are identical between 32-bit and 64-bit hosts. | ||
1 | 3 | ||
4 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
5 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
6 | Message-Id: <20210217202036.1724901-5-richard.henderson@linaro.org> | ||
7 | [PMD: Split patch as 5/5] | ||
8 | Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
9 | Message-Id: <20210218232840.1760806-6-f4bug@amsat.org> | ||
10 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
11 | --- | ||
12 | tcg/tci/tcg-target.c.inc | 49 ++++++++++++---------------------------- | ||
13 | 1 file changed, 14 insertions(+), 35 deletions(-) | ||
14 | |||
15 | diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc | ||
16 | index XXXXXXX..XXXXXXX 100644 | ||
17 | --- a/tcg/tci/tcg-target.c.inc | ||
18 | +++ b/tcg/tci/tcg-target.c.inc | ||
19 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, | ||
20 | tcg_out8(s, args[5]); /* condition */ | ||
21 | break; | ||
22 | #endif | ||
23 | - case INDEX_op_ld8u_i32: | ||
24 | - case INDEX_op_ld8s_i32: | ||
25 | - case INDEX_op_ld16u_i32: | ||
26 | - case INDEX_op_ld16s_i32: | ||
27 | + | ||
28 | + CASE_32_64(ld8u) | ||
29 | + CASE_32_64(ld8s) | ||
30 | + CASE_32_64(ld16u) | ||
31 | + CASE_32_64(ld16s) | ||
32 | case INDEX_op_ld_i32: | ||
33 | - case INDEX_op_st8_i32: | ||
34 | - case INDEX_op_st16_i32: | ||
35 | + CASE_64(ld32u) | ||
36 | + CASE_64(ld32s) | ||
37 | + CASE_64(ld) | ||
38 | + CASE_32_64(st8) | ||
39 | + CASE_32_64(st16) | ||
40 | case INDEX_op_st_i32: | ||
41 | - case INDEX_op_ld8u_i64: | ||
42 | - case INDEX_op_ld8s_i64: | ||
43 | - case INDEX_op_ld16u_i64: | ||
44 | - case INDEX_op_ld16s_i64: | ||
45 | - case INDEX_op_ld32u_i64: | ||
46 | - case INDEX_op_ld32s_i64: | ||
47 | - case INDEX_op_ld_i64: | ||
48 | - case INDEX_op_st8_i64: | ||
49 | - case INDEX_op_st16_i64: | ||
50 | - case INDEX_op_st32_i64: | ||
51 | - case INDEX_op_st_i64: | ||
52 | + CASE_64(st32) | ||
53 | + CASE_64(st) | ||
54 | stack_bounds_check(args[1], args[2]); | ||
55 | tcg_out_r(s, args[0]); | ||
56 | tcg_out_r(s, args[1]); | ||
57 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, | ||
58 | #endif | ||
59 | |||
60 | case INDEX_op_qemu_ld_i32: | ||
61 | - tcg_out_r(s, *args++); | ||
62 | - tcg_out_r(s, *args++); | ||
63 | - if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) { | ||
64 | - tcg_out_r(s, *args++); | ||
65 | - } | ||
66 | - tcg_out_i(s, *args++); | ||
67 | - break; | ||
68 | - case INDEX_op_qemu_ld_i64: | ||
69 | - tcg_out_r(s, *args++); | ||
70 | - if (TCG_TARGET_REG_BITS == 32) { | ||
71 | - tcg_out_r(s, *args++); | ||
72 | - } | ||
73 | - tcg_out_r(s, *args++); | ||
74 | - if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) { | ||
75 | - tcg_out_r(s, *args++); | ||
76 | - } | ||
77 | - tcg_out_i(s, *args++); | ||
78 | - break; | ||
79 | case INDEX_op_qemu_st_i32: | ||
80 | tcg_out_r(s, *args++); | ||
81 | tcg_out_r(s, *args++); | ||
82 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, | ||
83 | } | ||
84 | tcg_out_i(s, *args++); | ||
85 | break; | ||
86 | + | ||
87 | + case INDEX_op_qemu_ld_i64: | ||
88 | case INDEX_op_qemu_st_i64: | ||
89 | tcg_out_r(s, *args++); | ||
90 | if (TCG_TARGET_REG_BITS == 32) { | ||
91 | -- | ||
92 | 2.25.1 | ||
93 | |||
New patch | |||
---|---|---|---|
1 | Use explicit casts for ext8u opcodes, and allow truncation | ||
2 | to happen with the store for st8 opcodes. | ||
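For illustration only, a stand-alone sketch of the truncation this relies on (the variable
names are stand-ins, not the TCI interpreter itself):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t t0 = 0xABCD;          /* full-width "register" value */
        uint8_t mem[1] = { 0 };

        /* st8-style use: the 8-bit store truncates on its own. */
        *(uint8_t *)mem = t0;

        /* ext8u-style use: the truncation is made explicit at the point of use. */
        uint64_t zero_extended = (uint8_t)t0;

        printf("st8   -> 0x%02x\n", (unsigned)mem[0]);                    /* 0xcd */
        printf("ext8u -> 0x%llx\n", (unsigned long long)zero_extended);   /* 0xcd */
        return 0;
    }

Either way the value read from the register file can stay at full width, which is what makes
a dedicated 8-bit read helper unnecessary.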
1 | 3 | ||
4 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
5 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
6 | --- | ||
7 | tcg/tci.c | 23 +++++------------------ | ||
8 | 1 file changed, 5 insertions(+), 18 deletions(-) | ||
9 | |||
10 | diff --git a/tcg/tci.c b/tcg/tci.c | ||
11 | index XXXXXXX..XXXXXXX 100644 | ||
12 | --- a/tcg/tci.c | ||
13 | +++ b/tcg/tci.c | ||
14 | @@ -XXX,XX +XXX,XX @@ static int32_t tci_read_reg32s(const tcg_target_ulong *regs, TCGReg index) | ||
15 | } | ||
16 | #endif | ||
17 | |||
18 | -static uint8_t tci_read_reg8(const tcg_target_ulong *regs, TCGReg index) | ||
19 | -{ | ||
20 | - return (uint8_t)tci_read_reg(regs, index); | ||
21 | -} | ||
22 | - | ||
23 | static uint16_t tci_read_reg16(const tcg_target_ulong *regs, TCGReg index) | ||
24 | { | ||
25 | return (uint16_t)tci_read_reg(regs, index); | ||
26 | @@ -XXX,XX +XXX,XX @@ tci_read_r(const tcg_target_ulong *regs, const uint8_t **tb_ptr) | ||
27 | return value; | ||
28 | } | ||
29 | |||
30 | -/* Read indexed register (8 bit) from bytecode. */ | ||
31 | -static uint8_t tci_read_r8(const tcg_target_ulong *regs, const uint8_t **tb_ptr) | ||
32 | -{ | ||
33 | - uint8_t value = tci_read_reg8(regs, **tb_ptr); | ||
34 | - *tb_ptr += 1; | ||
35 | - return value; | ||
36 | -} | ||
37 | - | ||
38 | #if TCG_TARGET_HAS_ext8s_i32 || TCG_TARGET_HAS_ext8s_i64 | ||
39 | /* Read indexed register (8 bit signed) from bytecode. */ | ||
40 | static int8_t tci_read_r8s(const tcg_target_ulong *regs, const uint8_t **tb_ptr) | ||
41 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
42 | tci_write_reg(regs, t0, *(uint32_t *)(t1 + t2)); | ||
43 | break; | ||
44 | CASE_32_64(st8) | ||
45 | - t0 = tci_read_r8(regs, &tb_ptr); | ||
46 | + t0 = tci_read_r(regs, &tb_ptr); | ||
47 | t1 = tci_read_r(regs, &tb_ptr); | ||
48 | t2 = tci_read_s32(&tb_ptr); | ||
49 | *(uint8_t *)(t1 + t2) = t0; | ||
50 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
51 | #if TCG_TARGET_HAS_ext8u_i32 | ||
52 | case INDEX_op_ext8u_i32: | ||
53 | t0 = *tb_ptr++; | ||
54 | - t1 = tci_read_r8(regs, &tb_ptr); | ||
55 | - tci_write_reg(regs, t0, t1); | ||
56 | + t1 = tci_read_r(regs, &tb_ptr); | ||
57 | + tci_write_reg(regs, t0, (uint8_t)t1); | ||
58 | break; | ||
59 | #endif | ||
60 | #if TCG_TARGET_HAS_ext16u_i32 | ||
61 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
62 | #if TCG_TARGET_HAS_ext8u_i64 | ||
63 | case INDEX_op_ext8u_i64: | ||
64 | t0 = *tb_ptr++; | ||
65 | - t1 = tci_read_r8(regs, &tb_ptr); | ||
66 | - tci_write_reg(regs, t0, t1); | ||
67 | + t1 = tci_read_r(regs, &tb_ptr); | ||
68 | + tci_write_reg(regs, t0, (uint8_t)t1); | ||
69 | break; | ||
70 | #endif | ||
71 | #if TCG_TARGET_HAS_ext8s_i64 | ||
72 | -- | ||
73 | 2.25.1 | ||
74 | |||
New patch | |||
---|---|---|---|
1 | Use explicit casts for ext8s opcodes. | ||
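For illustration only, a stand-alone sketch of the sign extension the explicit (int8_t) cast
gives on the usual two's-complement hosts (names are stand-ins, not the TCI code):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t t1 = 0x80;            /* low byte has the sign bit set */

        /* ext8s-style use: (int8_t) narrows, the assignment sign-extends. */
        uint64_t result = (int8_t)t1;

        printf("0x%llx\n", (unsigned long long)result);  /* 0xffffffffffffff80 */
        return 0;
    }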
1 | 2 | ||
3 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
4 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
5 | --- | ||
6 | tcg/tci.c | 25 ++++--------------------- | ||
7 | 1 file changed, 4 insertions(+), 21 deletions(-) | ||
8 | |||
9 | diff --git a/tcg/tci.c b/tcg/tci.c | ||
10 | index XXXXXXX..XXXXXXX 100644 | ||
11 | --- a/tcg/tci.c | ||
12 | +++ b/tcg/tci.c | ||
13 | @@ -XXX,XX +XXX,XX @@ static tcg_target_ulong tci_read_reg(const tcg_target_ulong *regs, TCGReg index) | ||
14 | return regs[index]; | ||
15 | } | ||
16 | |||
17 | -#if TCG_TARGET_HAS_ext8s_i32 || TCG_TARGET_HAS_ext8s_i64 | ||
18 | -static int8_t tci_read_reg8s(const tcg_target_ulong *regs, TCGReg index) | ||
19 | -{ | ||
20 | - return (int8_t)tci_read_reg(regs, index); | ||
21 | -} | ||
22 | -#endif | ||
23 | - | ||
24 | #if TCG_TARGET_HAS_ext16s_i32 || TCG_TARGET_HAS_ext16s_i64 | ||
25 | static int16_t tci_read_reg16s(const tcg_target_ulong *regs, TCGReg index) | ||
26 | { | ||
27 | @@ -XXX,XX +XXX,XX @@ tci_read_r(const tcg_target_ulong *regs, const uint8_t **tb_ptr) | ||
28 | return value; | ||
29 | } | ||
30 | |||
31 | -#if TCG_TARGET_HAS_ext8s_i32 || TCG_TARGET_HAS_ext8s_i64 | ||
32 | -/* Read indexed register (8 bit signed) from bytecode. */ | ||
33 | -static int8_t tci_read_r8s(const tcg_target_ulong *regs, const uint8_t **tb_ptr) | ||
34 | -{ | ||
35 | - int8_t value = tci_read_reg8s(regs, **tb_ptr); | ||
36 | - *tb_ptr += 1; | ||
37 | - return value; | ||
38 | -} | ||
39 | -#endif | ||
40 | - | ||
41 | /* Read indexed register (16 bit) from bytecode. */ | ||
42 | static uint16_t tci_read_r16(const tcg_target_ulong *regs, | ||
43 | const uint8_t **tb_ptr) | ||
44 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
45 | #if TCG_TARGET_HAS_ext8s_i32 | ||
46 | case INDEX_op_ext8s_i32: | ||
47 | t0 = *tb_ptr++; | ||
48 | - t1 = tci_read_r8s(regs, &tb_ptr); | ||
49 | - tci_write_reg(regs, t0, t1); | ||
50 | + t1 = tci_read_r(regs, &tb_ptr); | ||
51 | + tci_write_reg(regs, t0, (int8_t)t1); | ||
52 | break; | ||
53 | #endif | ||
54 | #if TCG_TARGET_HAS_ext16s_i32 | ||
55 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
56 | #if TCG_TARGET_HAS_ext8s_i64 | ||
57 | case INDEX_op_ext8s_i64: | ||
58 | t0 = *tb_ptr++; | ||
59 | - t1 = tci_read_r8s(regs, &tb_ptr); | ||
60 | - tci_write_reg(regs, t0, t1); | ||
61 | + t1 = tci_read_r(regs, &tb_ptr); | ||
62 | + tci_write_reg(regs, t0, (int8_t)t1); | ||
63 | break; | ||
64 | #endif | ||
65 | #if TCG_TARGET_HAS_ext16s_i64 | ||
66 | -- | ||
67 | 2.25.1 | ||
68 | |||
New patch | |||
---|---|---|---|
1 | Use explicit casts for ext16u opcodes, and allow truncation | ||
2 | to happen with the store for st16 opcodes, and with the call | ||
3 | for bswap16 opcodes. | ||
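For illustration only, a stand-alone sketch of the two truncation paths (bswap16_sketch() is
a stand-in with a uint16_t parameter, chosen for the illustration, not the QEMU bswap16()
itself):

    #include <stdint.h>
    #include <stdio.h>

    static uint16_t bswap16_sketch(uint16_t x)
    {
        return (uint16_t)((x >> 8) | (x << 8));
    }

    int main(void)
    {
        uint64_t t1 = 0xDEAD1234;   /* full-width "register" value */

        /* bswap16-style use: the uint16_t parameter truncates at the call. */
        uint16_t swapped = bswap16_sketch(t1);

        /* ext16u-style use: the truncation is made explicit with a cast. */
        uint64_t zero_extended = (uint16_t)t1;

        printf("bswap16 -> 0x%04x\n", swapped);                            /* 0x3412 */
        printf("ext16u  -> 0x%llx\n", (unsigned long long)zero_extended);  /* 0x1234 */
        return 0;
    }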
1 | 4 | ||
5 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
6 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
7 | --- | ||
8 | tcg/tci.c | 28 +++++++--------------------- | ||
9 | 1 file changed, 7 insertions(+), 21 deletions(-) | ||
10 | |||
11 | diff --git a/tcg/tci.c b/tcg/tci.c | ||
12 | index XXXXXXX..XXXXXXX 100644 | ||
13 | --- a/tcg/tci.c | ||
14 | +++ b/tcg/tci.c | ||
15 | @@ -XXX,XX +XXX,XX @@ static int32_t tci_read_reg32s(const tcg_target_ulong *regs, TCGReg index) | ||
16 | } | ||
17 | #endif | ||
18 | |||
19 | -static uint16_t tci_read_reg16(const tcg_target_ulong *regs, TCGReg index) | ||
20 | -{ | ||
21 | - return (uint16_t)tci_read_reg(regs, index); | ||
22 | -} | ||
23 | - | ||
24 | static uint32_t tci_read_reg32(const tcg_target_ulong *regs, TCGReg index) | ||
25 | { | ||
26 | return (uint32_t)tci_read_reg(regs, index); | ||
27 | @@ -XXX,XX +XXX,XX @@ tci_read_r(const tcg_target_ulong *regs, const uint8_t **tb_ptr) | ||
28 | return value; | ||
29 | } | ||
30 | |||
31 | -/* Read indexed register (16 bit) from bytecode. */ | ||
32 | -static uint16_t tci_read_r16(const tcg_target_ulong *regs, | ||
33 | - const uint8_t **tb_ptr) | ||
34 | -{ | ||
35 | - uint16_t value = tci_read_reg16(regs, **tb_ptr); | ||
36 | - *tb_ptr += 1; | ||
37 | - return value; | ||
38 | -} | ||
39 | - | ||
40 | #if TCG_TARGET_HAS_ext16s_i32 || TCG_TARGET_HAS_ext16s_i64 | ||
41 | /* Read indexed register (16 bit signed) from bytecode. */ | ||
42 | static int16_t tci_read_r16s(const tcg_target_ulong *regs, | ||
43 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
44 | *(uint8_t *)(t1 + t2) = t0; | ||
45 | break; | ||
46 | CASE_32_64(st16) | ||
47 | - t0 = tci_read_r16(regs, &tb_ptr); | ||
48 | + t0 = tci_read_r(regs, &tb_ptr); | ||
49 | t1 = tci_read_r(regs, &tb_ptr); | ||
50 | t2 = tci_read_s32(&tb_ptr); | ||
51 | *(uint16_t *)(t1 + t2) = t0; | ||
52 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
53 | #if TCG_TARGET_HAS_ext16u_i32 | ||
54 | case INDEX_op_ext16u_i32: | ||
55 | t0 = *tb_ptr++; | ||
56 | - t1 = tci_read_r16(regs, &tb_ptr); | ||
57 | - tci_write_reg(regs, t0, t1); | ||
58 | + t1 = tci_read_r(regs, &tb_ptr); | ||
59 | + tci_write_reg(regs, t0, (uint16_t)t1); | ||
60 | break; | ||
61 | #endif | ||
62 | #if TCG_TARGET_HAS_bswap16_i32 | ||
63 | case INDEX_op_bswap16_i32: | ||
64 | t0 = *tb_ptr++; | ||
65 | - t1 = tci_read_r16(regs, &tb_ptr); | ||
66 | + t1 = tci_read_r(regs, &tb_ptr); | ||
67 | tci_write_reg(regs, t0, bswap16(t1)); | ||
68 | break; | ||
69 | #endif | ||
70 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
71 | #if TCG_TARGET_HAS_ext16u_i64 | ||
72 | case INDEX_op_ext16u_i64: | ||
73 | t0 = *tb_ptr++; | ||
74 | - t1 = tci_read_r16(regs, &tb_ptr); | ||
75 | - tci_write_reg(regs, t0, t1); | ||
76 | + t1 = tci_read_r(regs, &tb_ptr); | ||
77 | + tci_write_reg(regs, t0, (uint16_t)t1); | ||
78 | break; | ||
79 | #endif | ||
80 | #if TCG_TARGET_HAS_ext32s_i64 | ||
81 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
82 | #if TCG_TARGET_HAS_bswap16_i64 | ||
83 | case INDEX_op_bswap16_i64: | ||
84 | t0 = *tb_ptr++; | ||
85 | - t1 = tci_read_r16(regs, &tb_ptr); | ||
86 | + t1 = tci_read_r(regs, &tb_ptr); | ||
87 | tci_write_reg(regs, t0, bswap16(t1)); | ||
88 | break; | ||
89 | #endif | ||
90 | -- | ||
91 | 2.25.1 | ||
92 | |||
New patch | |||
---|---|---|---|
1 | Use explicit casts for ext16s opcodes. | ||
1 | 2 | ||
3 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
4 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
5 | --- | ||
6 | tcg/tci.c | 26 ++++---------------------- | ||
7 | 1 file changed, 4 insertions(+), 22 deletions(-) | ||
8 | |||
9 | diff --git a/tcg/tci.c b/tcg/tci.c | ||
10 | index XXXXXXX..XXXXXXX 100644 | ||
11 | --- a/tcg/tci.c | ||
12 | +++ b/tcg/tci.c | ||
13 | @@ -XXX,XX +XXX,XX @@ static tcg_target_ulong tci_read_reg(const tcg_target_ulong *regs, TCGReg index) | ||
14 | return regs[index]; | ||
15 | } | ||
16 | |||
17 | -#if TCG_TARGET_HAS_ext16s_i32 || TCG_TARGET_HAS_ext16s_i64 | ||
18 | -static int16_t tci_read_reg16s(const tcg_target_ulong *regs, TCGReg index) | ||
19 | -{ | ||
20 | - return (int16_t)tci_read_reg(regs, index); | ||
21 | -} | ||
22 | -#endif | ||
23 | - | ||
24 | #if TCG_TARGET_REG_BITS == 64 | ||
25 | static int32_t tci_read_reg32s(const tcg_target_ulong *regs, TCGReg index) | ||
26 | { | ||
27 | @@ -XXX,XX +XXX,XX @@ tci_read_r(const tcg_target_ulong *regs, const uint8_t **tb_ptr) | ||
28 | return value; | ||
29 | } | ||
30 | |||
31 | -#if TCG_TARGET_HAS_ext16s_i32 || TCG_TARGET_HAS_ext16s_i64 | ||
32 | -/* Read indexed register (16 bit signed) from bytecode. */ | ||
33 | -static int16_t tci_read_r16s(const tcg_target_ulong *regs, | ||
34 | - const uint8_t **tb_ptr) | ||
35 | -{ | ||
36 | - int16_t value = tci_read_reg16s(regs, **tb_ptr); | ||
37 | - *tb_ptr += 1; | ||
38 | - return value; | ||
39 | -} | ||
40 | -#endif | ||
41 | - | ||
42 | /* Read indexed register (32 bit) from bytecode. */ | ||
43 | static uint32_t tci_read_r32(const tcg_target_ulong *regs, | ||
44 | const uint8_t **tb_ptr) | ||
45 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
46 | #if TCG_TARGET_HAS_ext16s_i32 | ||
47 | case INDEX_op_ext16s_i32: | ||
48 | t0 = *tb_ptr++; | ||
49 | - t1 = tci_read_r16s(regs, &tb_ptr); | ||
50 | - tci_write_reg(regs, t0, t1); | ||
51 | + t1 = tci_read_r(regs, &tb_ptr); | ||
52 | + tci_write_reg(regs, t0, (int16_t)t1); | ||
53 | break; | ||
54 | #endif | ||
55 | #if TCG_TARGET_HAS_ext8u_i32 | ||
56 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
57 | #if TCG_TARGET_HAS_ext16s_i64 | ||
58 | case INDEX_op_ext16s_i64: | ||
59 | t0 = *tb_ptr++; | ||
60 | - t1 = tci_read_r16s(regs, &tb_ptr); | ||
61 | - tci_write_reg(regs, t0, t1); | ||
62 | + t1 = tci_read_r(regs, &tb_ptr); | ||
63 | + tci_write_reg(regs, t0, (int16_t)t1); | ||
64 | break; | ||
65 | #endif | ||
66 | #if TCG_TARGET_HAS_ext16u_i64 | ||
67 | -- | ||
68 | 2.25.1 | ||
69 | |||
New patch | |||
---|---|---|---|
1 | 1 | Use explicit casts for ext32u opcodes, and allow truncation | |
2 | to happen for other users. | ||
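For illustration only, a stand-alone sketch of why the shift (and likewise the divide and
remainder) cases keep an explicit (uint32_t) cast while plain stores and bitwise users can
simply let truncation happen (the values are made up):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* Only the low 32 bits are the i32 value; the top half is leftover. */
        uint64_t t1 = 0xFFFFFFFF00000080ull;
        uint64_t t2 = 4;

        uint64_t no_cast   = t1 >> (t2 & 31);            /* junk shifts into the result */
        uint64_t with_cast = (uint32_t)t1 >> (t2 & 31);  /* shr_i32 semantics: 0x8 */

        printf("no cast:   0x%llx\n", (unsigned long long)no_cast);
        printf("with cast: 0x%llx\n", (unsigned long long)with_cast);
        return 0;
    }

Operations whose low 32 bits never depend on the high bits (stores, and, or, xor, add, sub,
mul) stay correct without the cast, which is the "other users" part of the message above.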
3 | |||
4 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
5 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
6 | --- | ||
7 | tcg/tci.c | 122 ++++++++++++++++++++++++------------------------------ | ||
8 | 1 file changed, 54 insertions(+), 68 deletions(-) | ||
9 | |||
10 | diff --git a/tcg/tci.c b/tcg/tci.c | ||
11 | index XXXXXXX..XXXXXXX 100644 | ||
12 | --- a/tcg/tci.c | ||
13 | +++ b/tcg/tci.c | ||
14 | @@ -XXX,XX +XXX,XX @@ static int32_t tci_read_reg32s(const tcg_target_ulong *regs, TCGReg index) | ||
15 | } | ||
16 | #endif | ||
17 | |||
18 | -static uint32_t tci_read_reg32(const tcg_target_ulong *regs, TCGReg index) | ||
19 | -{ | ||
20 | - return (uint32_t)tci_read_reg(regs, index); | ||
21 | -} | ||
22 | - | ||
23 | #if TCG_TARGET_REG_BITS == 64 | ||
24 | static uint64_t tci_read_reg64(const tcg_target_ulong *regs, TCGReg index) | ||
25 | { | ||
26 | @@ -XXX,XX +XXX,XX @@ tci_read_r(const tcg_target_ulong *regs, const uint8_t **tb_ptr) | ||
27 | return value; | ||
28 | } | ||
29 | |||
30 | -/* Read indexed register (32 bit) from bytecode. */ | ||
31 | -static uint32_t tci_read_r32(const tcg_target_ulong *regs, | ||
32 | - const uint8_t **tb_ptr) | ||
33 | -{ | ||
34 | - uint32_t value = tci_read_reg32(regs, **tb_ptr); | ||
35 | - *tb_ptr += 1; | ||
36 | - return value; | ||
37 | -} | ||
38 | - | ||
39 | #if TCG_TARGET_REG_BITS == 32 | ||
40 | /* Read two indexed registers (2 * 32 bit) from bytecode. */ | ||
41 | static uint64_t tci_read_r64(const tcg_target_ulong *regs, | ||
42 | const uint8_t **tb_ptr) | ||
43 | { | ||
44 | - uint32_t low = tci_read_r32(regs, tb_ptr); | ||
45 | - return tci_uint64(tci_read_r32(regs, tb_ptr), low); | ||
46 | + uint32_t low = tci_read_r(regs, tb_ptr); | ||
47 | + return tci_uint64(tci_read_r(regs, tb_ptr), low); | ||
48 | } | ||
49 | #elif TCG_TARGET_REG_BITS == 64 | ||
50 | /* Read indexed register (32 bit signed) from bytecode. */ | ||
51 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
52 | continue; | ||
53 | case INDEX_op_setcond_i32: | ||
54 | t0 = *tb_ptr++; | ||
55 | - t1 = tci_read_r32(regs, &tb_ptr); | ||
56 | - t2 = tci_read_r32(regs, &tb_ptr); | ||
57 | + t1 = tci_read_r(regs, &tb_ptr); | ||
58 | + t2 = tci_read_r(regs, &tb_ptr); | ||
59 | condition = *tb_ptr++; | ||
60 | tci_write_reg(regs, t0, tci_compare32(t1, t2, condition)); | ||
61 | break; | ||
62 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
63 | #endif | ||
64 | case INDEX_op_mov_i32: | ||
65 | t0 = *tb_ptr++; | ||
66 | - t1 = tci_read_r32(regs, &tb_ptr); | ||
67 | + t1 = tci_read_r(regs, &tb_ptr); | ||
68 | tci_write_reg(regs, t0, t1); | ||
69 | break; | ||
70 | case INDEX_op_tci_movi_i32: | ||
71 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
72 | break; | ||
73 | case INDEX_op_st_i32: | ||
74 | CASE_64(st32) | ||
75 | - t0 = tci_read_r32(regs, &tb_ptr); | ||
76 | + t0 = tci_read_r(regs, &tb_ptr); | ||
77 | t1 = tci_read_r(regs, &tb_ptr); | ||
78 | t2 = tci_read_s32(&tb_ptr); | ||
79 | *(uint32_t *)(t1 + t2) = t0; | ||
80 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
81 | |||
82 | case INDEX_op_add_i32: | ||
83 | t0 = *tb_ptr++; | ||
84 | - t1 = tci_read_r32(regs, &tb_ptr); | ||
85 | - t2 = tci_read_r32(regs, &tb_ptr); | ||
86 | + t1 = tci_read_r(regs, &tb_ptr); | ||
87 | + t2 = tci_read_r(regs, &tb_ptr); | ||
88 | tci_write_reg(regs, t0, t1 + t2); | ||
89 | break; | ||
90 | case INDEX_op_sub_i32: | ||
91 | t0 = *tb_ptr++; | ||
92 | - t1 = tci_read_r32(regs, &tb_ptr); | ||
93 | - t2 = tci_read_r32(regs, &tb_ptr); | ||
94 | + t1 = tci_read_r(regs, &tb_ptr); | ||
95 | + t2 = tci_read_r(regs, &tb_ptr); | ||
96 | tci_write_reg(regs, t0, t1 - t2); | ||
97 | break; | ||
98 | case INDEX_op_mul_i32: | ||
99 | t0 = *tb_ptr++; | ||
100 | - t1 = tci_read_r32(regs, &tb_ptr); | ||
101 | - t2 = tci_read_r32(regs, &tb_ptr); | ||
102 | + t1 = tci_read_r(regs, &tb_ptr); | ||
103 | + t2 = tci_read_r(regs, &tb_ptr); | ||
104 | tci_write_reg(regs, t0, t1 * t2); | ||
105 | break; | ||
106 | case INDEX_op_div_i32: | ||
107 | t0 = *tb_ptr++; | ||
108 | - t1 = tci_read_r32(regs, &tb_ptr); | ||
109 | - t2 = tci_read_r32(regs, &tb_ptr); | ||
110 | + t1 = tci_read_r(regs, &tb_ptr); | ||
111 | + t2 = tci_read_r(regs, &tb_ptr); | ||
112 | tci_write_reg(regs, t0, (int32_t)t1 / (int32_t)t2); | ||
113 | break; | ||
114 | case INDEX_op_divu_i32: | ||
115 | t0 = *tb_ptr++; | ||
116 | - t1 = tci_read_r32(regs, &tb_ptr); | ||
117 | - t2 = tci_read_r32(regs, &tb_ptr); | ||
118 | - tci_write_reg(regs, t0, t1 / t2); | ||
119 | + t1 = tci_read_r(regs, &tb_ptr); | ||
120 | + t2 = tci_read_r(regs, &tb_ptr); | ||
121 | + tci_write_reg(regs, t0, (uint32_t)t1 / (uint32_t)t2); | ||
122 | break; | ||
123 | case INDEX_op_rem_i32: | ||
124 | t0 = *tb_ptr++; | ||
125 | - t1 = tci_read_r32(regs, &tb_ptr); | ||
126 | - t2 = tci_read_r32(regs, &tb_ptr); | ||
127 | + t1 = tci_read_r(regs, &tb_ptr); | ||
128 | + t2 = tci_read_r(regs, &tb_ptr); | ||
129 | tci_write_reg(regs, t0, (int32_t)t1 % (int32_t)t2); | ||
130 | break; | ||
131 | case INDEX_op_remu_i32: | ||
132 | t0 = *tb_ptr++; | ||
133 | - t1 = tci_read_r32(regs, &tb_ptr); | ||
134 | - t2 = tci_read_r32(regs, &tb_ptr); | ||
135 | - tci_write_reg(regs, t0, t1 % t2); | ||
136 | + t1 = tci_read_r(regs, &tb_ptr); | ||
137 | + t2 = tci_read_r(regs, &tb_ptr); | ||
138 | + tci_write_reg(regs, t0, (uint32_t)t1 % (uint32_t)t2); | ||
139 | break; | ||
140 | case INDEX_op_and_i32: | ||
141 | t0 = *tb_ptr++; | ||
142 | - t1 = tci_read_r32(regs, &tb_ptr); | ||
143 | - t2 = tci_read_r32(regs, &tb_ptr); | ||
144 | + t1 = tci_read_r(regs, &tb_ptr); | ||
145 | + t2 = tci_read_r(regs, &tb_ptr); | ||
146 | tci_write_reg(regs, t0, t1 & t2); | ||
147 | break; | ||
148 | case INDEX_op_or_i32: | ||
149 | t0 = *tb_ptr++; | ||
150 | - t1 = tci_read_r32(regs, &tb_ptr); | ||
151 | - t2 = tci_read_r32(regs, &tb_ptr); | ||
152 | + t1 = tci_read_r(regs, &tb_ptr); | ||
153 | + t2 = tci_read_r(regs, &tb_ptr); | ||
154 | tci_write_reg(regs, t0, t1 | t2); | ||
155 | break; | ||
156 | case INDEX_op_xor_i32: | ||
157 | t0 = *tb_ptr++; | ||
158 | - t1 = tci_read_r32(regs, &tb_ptr); | ||
159 | - t2 = tci_read_r32(regs, &tb_ptr); | ||
160 | + t1 = tci_read_r(regs, &tb_ptr); | ||
161 | + t2 = tci_read_r(regs, &tb_ptr); | ||
162 | tci_write_reg(regs, t0, t1 ^ t2); | ||
163 | break; | ||
164 | |||
165 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
166 | |||
167 | case INDEX_op_shl_i32: | ||
168 | t0 = *tb_ptr++; | ||
169 | - t1 = tci_read_r32(regs, &tb_ptr); | ||
170 | - t2 = tci_read_r32(regs, &tb_ptr); | ||
171 | - tci_write_reg(regs, t0, t1 << (t2 & 31)); | ||
172 | + t1 = tci_read_r(regs, &tb_ptr); | ||
173 | + t2 = tci_read_r(regs, &tb_ptr); | ||
174 | + tci_write_reg(regs, t0, (uint32_t)t1 << (t2 & 31)); | ||
175 | break; | ||
176 | case INDEX_op_shr_i32: | ||
177 | t0 = *tb_ptr++; | ||
178 | - t1 = tci_read_r32(regs, &tb_ptr); | ||
179 | - t2 = tci_read_r32(regs, &tb_ptr); | ||
180 | - tci_write_reg(regs, t0, t1 >> (t2 & 31)); | ||
181 | + t1 = tci_read_r(regs, &tb_ptr); | ||
182 | + t2 = tci_read_r(regs, &tb_ptr); | ||
183 | + tci_write_reg(regs, t0, (uint32_t)t1 >> (t2 & 31)); | ||
184 | break; | ||
185 | case INDEX_op_sar_i32: | ||
186 | t0 = *tb_ptr++; | ||
187 | - t1 = tci_read_r32(regs, &tb_ptr); | ||
188 | - t2 = tci_read_r32(regs, &tb_ptr); | ||
189 | - tci_write_reg(regs, t0, ((int32_t)t1 >> (t2 & 31))); | ||
190 | + t1 = tci_read_r(regs, &tb_ptr); | ||
191 | + t2 = tci_read_r(regs, &tb_ptr); | ||
192 | + tci_write_reg(regs, t0, (int32_t)t1 >> (t2 & 31)); | ||
193 | break; | ||
194 | #if TCG_TARGET_HAS_rot_i32 | ||
195 | case INDEX_op_rotl_i32: | ||
196 | t0 = *tb_ptr++; | ||
197 | - t1 = tci_read_r32(regs, &tb_ptr); | ||
198 | - t2 = tci_read_r32(regs, &tb_ptr); | ||
199 | + t1 = tci_read_r(regs, &tb_ptr); | ||
200 | + t2 = tci_read_r(regs, &tb_ptr); | ||
201 | tci_write_reg(regs, t0, rol32(t1, t2 & 31)); | ||
202 | break; | ||
203 | case INDEX_op_rotr_i32: | ||
204 | t0 = *tb_ptr++; | ||
205 | - t1 = tci_read_r32(regs, &tb_ptr); | ||
206 | - t2 = tci_read_r32(regs, &tb_ptr); | ||
207 | + t1 = tci_read_r(regs, &tb_ptr); | ||
208 | + t2 = tci_read_r(regs, &tb_ptr); | ||
209 | tci_write_reg(regs, t0, ror32(t1, t2 & 31)); | ||
210 | break; | ||
211 | #endif | ||
212 | #if TCG_TARGET_HAS_deposit_i32 | ||
213 | case INDEX_op_deposit_i32: | ||
214 | t0 = *tb_ptr++; | ||
215 | - t1 = tci_read_r32(regs, &tb_ptr); | ||
216 | - t2 = tci_read_r32(regs, &tb_ptr); | ||
217 | + t1 = tci_read_r(regs, &tb_ptr); | ||
218 | + t2 = tci_read_r(regs, &tb_ptr); | ||
219 | tmp16 = *tb_ptr++; | ||
220 | tmp8 = *tb_ptr++; | ||
221 | tmp32 = (((1 << tmp8) - 1) << tmp16); | ||
222 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
223 | break; | ||
224 | #endif | ||
225 | case INDEX_op_brcond_i32: | ||
226 | - t0 = tci_read_r32(regs, &tb_ptr); | ||
227 | - t1 = tci_read_r32(regs, &tb_ptr); | ||
228 | + t0 = tci_read_r(regs, &tb_ptr); | ||
229 | + t1 = tci_read_r(regs, &tb_ptr); | ||
230 | condition = *tb_ptr++; | ||
231 | label = tci_read_label(&tb_ptr); | ||
232 | if (tci_compare32(t0, t1, condition)) { | ||
233 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
234 | case INDEX_op_mulu2_i32: | ||
235 | t0 = *tb_ptr++; | ||
236 | t1 = *tb_ptr++; | ||
237 | - t2 = tci_read_r32(regs, &tb_ptr); | ||
238 | - tmp64 = tci_read_r32(regs, &tb_ptr); | ||
239 | - tci_write_reg64(regs, t1, t0, t2 * tmp64); | ||
240 | + t2 = tci_read_r(regs, &tb_ptr); | ||
241 | + tmp64 = (uint32_t)tci_read_r(regs, &tb_ptr); | ||
242 | + tci_write_reg64(regs, t1, t0, (uint32_t)t2 * tmp64); | ||
243 | break; | ||
244 | #endif /* TCG_TARGET_REG_BITS == 32 */ | ||
245 | #if TCG_TARGET_HAS_ext8s_i32 | ||
246 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
247 | #if TCG_TARGET_HAS_bswap32_i32 | ||
248 | case INDEX_op_bswap32_i32: | ||
249 | t0 = *tb_ptr++; | ||
250 | - t1 = tci_read_r32(regs, &tb_ptr); | ||
251 | + t1 = tci_read_r(regs, &tb_ptr); | ||
252 | tci_write_reg(regs, t0, bswap32(t1)); | ||
253 | break; | ||
254 | #endif | ||
255 | #if TCG_TARGET_HAS_not_i32 | ||
256 | case INDEX_op_not_i32: | ||
257 | t0 = *tb_ptr++; | ||
258 | - t1 = tci_read_r32(regs, &tb_ptr); | ||
259 | + t1 = tci_read_r(regs, &tb_ptr); | ||
260 | tci_write_reg(regs, t0, ~t1); | ||
261 | break; | ||
262 | #endif | ||
263 | #if TCG_TARGET_HAS_neg_i32 | ||
264 | case INDEX_op_neg_i32: | ||
265 | t0 = *tb_ptr++; | ||
266 | - t1 = tci_read_r32(regs, &tb_ptr); | ||
267 | + t1 = tci_read_r(regs, &tb_ptr); | ||
268 | tci_write_reg(regs, t0, -t1); | ||
269 | break; | ||
270 | #endif | ||
271 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
272 | #endif | ||
273 | case INDEX_op_extu_i32_i64: | ||
274 | t0 = *tb_ptr++; | ||
275 | - t1 = tci_read_r32(regs, &tb_ptr); | ||
276 | - tci_write_reg(regs, t0, t1); | ||
277 | + t1 = tci_read_r(regs, &tb_ptr); | ||
278 | + tci_write_reg(regs, t0, (uint32_t)t1); | ||
279 | break; | ||
280 | #if TCG_TARGET_HAS_bswap16_i64 | ||
281 | case INDEX_op_bswap16_i64: | ||
282 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
283 | #if TCG_TARGET_HAS_bswap32_i64 | ||
284 | case INDEX_op_bswap32_i64: | ||
285 | t0 = *tb_ptr++; | ||
286 | - t1 = tci_read_r32(regs, &tb_ptr); | ||
287 | + t1 = tci_read_r(regs, &tb_ptr); | ||
288 | tci_write_reg(regs, t0, bswap32(t1)); | ||
289 | break; | ||
290 | #endif | ||
291 | -- | ||
292 | 2.25.1 | ||
293 | |||
294 | diff view generated by jsdifflib |
New patch | |||
---|---|---|---|
1 | Use explicit casts for ext32s opcodes. | ||
1 | 2 | ||
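For illustration only (not part of the patch): on a 64-bit host the interpreter registers are a full 64 bits wide, so reading the whole register and applying an explicit cast yields the same value the dedicated 32-bit readers returned. A minimal sketch, assuming the surrounding tci.c context:

    /* Illustrative sketch only: obtaining a sign-extended 32-bit value
     * from a full-width interpreter register. */
    tcg_target_ulong v = tci_read_r(regs, &tb_ptr);  /* whole register */
    int64_t ext32s = (int32_t)v;                      /* explicit cast, as the patch now does */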
3 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
4 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
5 | --- | ||
6 | tcg/tci.c | 20 ++------------------ | ||
7 | 1 file changed, 2 insertions(+), 18 deletions(-) | ||
8 | |||
9 | diff --git a/tcg/tci.c b/tcg/tci.c | ||
10 | index XXXXXXX..XXXXXXX 100644 | ||
11 | --- a/tcg/tci.c | ||
12 | +++ b/tcg/tci.c | ||
13 | @@ -XXX,XX +XXX,XX @@ static tcg_target_ulong tci_read_reg(const tcg_target_ulong *regs, TCGReg index) | ||
14 | return regs[index]; | ||
15 | } | ||
16 | |||
17 | -#if TCG_TARGET_REG_BITS == 64 | ||
18 | -static int32_t tci_read_reg32s(const tcg_target_ulong *regs, TCGReg index) | ||
19 | -{ | ||
20 | - return (int32_t)tci_read_reg(regs, index); | ||
21 | -} | ||
22 | -#endif | ||
23 | - | ||
24 | #if TCG_TARGET_REG_BITS == 64 | ||
25 | static uint64_t tci_read_reg64(const tcg_target_ulong *regs, TCGReg index) | ||
26 | { | ||
27 | @@ -XXX,XX +XXX,XX @@ static uint64_t tci_read_r64(const tcg_target_ulong *regs, | ||
28 | return tci_uint64(tci_read_r(regs, tb_ptr), low); | ||
29 | } | ||
30 | #elif TCG_TARGET_REG_BITS == 64 | ||
31 | -/* Read indexed register (32 bit signed) from bytecode. */ | ||
32 | -static int32_t tci_read_r32s(const tcg_target_ulong *regs, | ||
33 | - const uint8_t **tb_ptr) | ||
34 | -{ | ||
35 | - int32_t value = tci_read_reg32s(regs, **tb_ptr); | ||
36 | - *tb_ptr += 1; | ||
37 | - return value; | ||
38 | -} | ||
39 | - | ||
40 | /* Read indexed register (64 bit) from bytecode. */ | ||
41 | static uint64_t tci_read_r64(const tcg_target_ulong *regs, | ||
42 | const uint8_t **tb_ptr) | ||
43 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
44 | #endif | ||
45 | case INDEX_op_ext_i32_i64: | ||
46 | t0 = *tb_ptr++; | ||
47 | - t1 = tci_read_r32s(regs, &tb_ptr); | ||
48 | - tci_write_reg(regs, t0, t1); | ||
49 | + t1 = tci_read_r(regs, &tb_ptr); | ||
50 | + tci_write_reg(regs, t0, (int32_t)t1); | ||
51 | break; | ||
52 | #if TCG_TARGET_HAS_ext32u_i64 | ||
53 | case INDEX_op_ext32u_i64: | ||
54 | -- | ||
55 | 2.25.1 | ||
56 | |||
57 | diff view generated by jsdifflib |
New patch | |||
---|---|---|---|
1 | 1 | In all cases restricted to 64-bit hosts, tci_read_r is | |
2 | identical. We retain the 64-bit symbol for the single | ||
3 | case of INDEX_op_qemu_st_i64. | ||
4 | |||
5 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
6 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
7 | --- | ||
8 | tcg/tci.c | 93 +++++++++++++++++++++++++------------------------------ | ||
9 | 1 file changed, 42 insertions(+), 51 deletions(-) | ||
10 | |||
11 | diff --git a/tcg/tci.c b/tcg/tci.c | ||
12 | index XXXXXXX..XXXXXXX 100644 | ||
13 | --- a/tcg/tci.c | ||
14 | +++ b/tcg/tci.c | ||
15 | @@ -XXX,XX +XXX,XX @@ static tcg_target_ulong tci_read_reg(const tcg_target_ulong *regs, TCGReg index) | ||
16 | return regs[index]; | ||
17 | } | ||
18 | |||
19 | -#if TCG_TARGET_REG_BITS == 64 | ||
20 | -static uint64_t tci_read_reg64(const tcg_target_ulong *regs, TCGReg index) | ||
21 | -{ | ||
22 | - return tci_read_reg(regs, index); | ||
23 | -} | ||
24 | -#endif | ||
25 | - | ||
26 | static void | ||
27 | tci_write_reg(tcg_target_ulong *regs, TCGReg index, tcg_target_ulong value) | ||
28 | { | ||
29 | @@ -XXX,XX +XXX,XX @@ static uint64_t tci_read_r64(const tcg_target_ulong *regs, | ||
30 | static uint64_t tci_read_r64(const tcg_target_ulong *regs, | ||
31 | const uint8_t **tb_ptr) | ||
32 | { | ||
33 | - uint64_t value = tci_read_reg64(regs, **tb_ptr); | ||
34 | - *tb_ptr += 1; | ||
35 | - return value; | ||
36 | + return tci_read_r(regs, tb_ptr); | ||
37 | } | ||
38 | #endif | ||
39 | |||
40 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
41 | #elif TCG_TARGET_REG_BITS == 64 | ||
42 | case INDEX_op_setcond_i64: | ||
43 | t0 = *tb_ptr++; | ||
44 | - t1 = tci_read_r64(regs, &tb_ptr); | ||
45 | - t2 = tci_read_r64(regs, &tb_ptr); | ||
46 | + t1 = tci_read_r(regs, &tb_ptr); | ||
47 | + t2 = tci_read_r(regs, &tb_ptr); | ||
48 | condition = *tb_ptr++; | ||
49 | tci_write_reg(regs, t0, tci_compare64(t1, t2, condition)); | ||
50 | break; | ||
51 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
52 | #if TCG_TARGET_REG_BITS == 64 | ||
53 | case INDEX_op_mov_i64: | ||
54 | t0 = *tb_ptr++; | ||
55 | - t1 = tci_read_r64(regs, &tb_ptr); | ||
56 | + t1 = tci_read_r(regs, &tb_ptr); | ||
57 | tci_write_reg(regs, t0, t1); | ||
58 | break; | ||
59 | case INDEX_op_tci_movi_i64: | ||
60 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
61 | tci_write_reg(regs, t0, *(uint64_t *)(t1 + t2)); | ||
62 | break; | ||
63 | case INDEX_op_st_i64: | ||
64 | - t0 = tci_read_r64(regs, &tb_ptr); | ||
65 | + t0 = tci_read_r(regs, &tb_ptr); | ||
66 | t1 = tci_read_r(regs, &tb_ptr); | ||
67 | t2 = tci_read_s32(&tb_ptr); | ||
68 | *(uint64_t *)(t1 + t2) = t0; | ||
69 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
70 | |||
71 | case INDEX_op_add_i64: | ||
72 | t0 = *tb_ptr++; | ||
73 | - t1 = tci_read_r64(regs, &tb_ptr); | ||
74 | - t2 = tci_read_r64(regs, &tb_ptr); | ||
75 | + t1 = tci_read_r(regs, &tb_ptr); | ||
76 | + t2 = tci_read_r(regs, &tb_ptr); | ||
77 | tci_write_reg(regs, t0, t1 + t2); | ||
78 | break; | ||
79 | case INDEX_op_sub_i64: | ||
80 | t0 = *tb_ptr++; | ||
81 | - t1 = tci_read_r64(regs, &tb_ptr); | ||
82 | - t2 = tci_read_r64(regs, &tb_ptr); | ||
83 | + t1 = tci_read_r(regs, &tb_ptr); | ||
84 | + t2 = tci_read_r(regs, &tb_ptr); | ||
85 | tci_write_reg(regs, t0, t1 - t2); | ||
86 | break; | ||
87 | case INDEX_op_mul_i64: | ||
88 | t0 = *tb_ptr++; | ||
89 | - t1 = tci_read_r64(regs, &tb_ptr); | ||
90 | - t2 = tci_read_r64(regs, &tb_ptr); | ||
91 | + t1 = tci_read_r(regs, &tb_ptr); | ||
92 | + t2 = tci_read_r(regs, &tb_ptr); | ||
93 | tci_write_reg(regs, t0, t1 * t2); | ||
94 | break; | ||
95 | case INDEX_op_div_i64: | ||
96 | t0 = *tb_ptr++; | ||
97 | - t1 = tci_read_r64(regs, &tb_ptr); | ||
98 | - t2 = tci_read_r64(regs, &tb_ptr); | ||
99 | + t1 = tci_read_r(regs, &tb_ptr); | ||
100 | + t2 = tci_read_r(regs, &tb_ptr); | ||
101 | tci_write_reg(regs, t0, (int64_t)t1 / (int64_t)t2); | ||
102 | break; | ||
103 | case INDEX_op_divu_i64: | ||
104 | t0 = *tb_ptr++; | ||
105 | - t1 = tci_read_r64(regs, &tb_ptr); | ||
106 | - t2 = tci_read_r64(regs, &tb_ptr); | ||
107 | + t1 = tci_read_r(regs, &tb_ptr); | ||
108 | + t2 = tci_read_r(regs, &tb_ptr); | ||
109 | tci_write_reg(regs, t0, (uint64_t)t1 / (uint64_t)t2); | ||
110 | break; | ||
111 | case INDEX_op_rem_i64: | ||
112 | t0 = *tb_ptr++; | ||
113 | - t1 = tci_read_r64(regs, &tb_ptr); | ||
114 | - t2 = tci_read_r64(regs, &tb_ptr); | ||
115 | + t1 = tci_read_r(regs, &tb_ptr); | ||
116 | + t2 = tci_read_r(regs, &tb_ptr); | ||
117 | tci_write_reg(regs, t0, (int64_t)t1 % (int64_t)t2); | ||
118 | break; | ||
119 | case INDEX_op_remu_i64: | ||
120 | t0 = *tb_ptr++; | ||
121 | - t1 = tci_read_r64(regs, &tb_ptr); | ||
122 | - t2 = tci_read_r64(regs, &tb_ptr); | ||
123 | + t1 = tci_read_r(regs, &tb_ptr); | ||
124 | + t2 = tci_read_r(regs, &tb_ptr); | ||
125 | tci_write_reg(regs, t0, (uint64_t)t1 % (uint64_t)t2); | ||
126 | break; | ||
127 | case INDEX_op_and_i64: | ||
128 | t0 = *tb_ptr++; | ||
129 | - t1 = tci_read_r64(regs, &tb_ptr); | ||
130 | - t2 = tci_read_r64(regs, &tb_ptr); | ||
131 | + t1 = tci_read_r(regs, &tb_ptr); | ||
132 | + t2 = tci_read_r(regs, &tb_ptr); | ||
133 | tci_write_reg(regs, t0, t1 & t2); | ||
134 | break; | ||
135 | case INDEX_op_or_i64: | ||
136 | t0 = *tb_ptr++; | ||
137 | - t1 = tci_read_r64(regs, &tb_ptr); | ||
138 | - t2 = tci_read_r64(regs, &tb_ptr); | ||
139 | + t1 = tci_read_r(regs, &tb_ptr); | ||
140 | + t2 = tci_read_r(regs, &tb_ptr); | ||
141 | tci_write_reg(regs, t0, t1 | t2); | ||
142 | break; | ||
143 | case INDEX_op_xor_i64: | ||
144 | t0 = *tb_ptr++; | ||
145 | - t1 = tci_read_r64(regs, &tb_ptr); | ||
146 | - t2 = tci_read_r64(regs, &tb_ptr); | ||
147 | + t1 = tci_read_r(regs, &tb_ptr); | ||
148 | + t2 = tci_read_r(regs, &tb_ptr); | ||
149 | tci_write_reg(regs, t0, t1 ^ t2); | ||
150 | break; | ||
151 | |||
152 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
153 | |||
154 | case INDEX_op_shl_i64: | ||
155 | t0 = *tb_ptr++; | ||
156 | - t1 = tci_read_r64(regs, &tb_ptr); | ||
157 | - t2 = tci_read_r64(regs, &tb_ptr); | ||
158 | + t1 = tci_read_r(regs, &tb_ptr); | ||
159 | + t2 = tci_read_r(regs, &tb_ptr); | ||
160 | tci_write_reg(regs, t0, t1 << (t2 & 63)); | ||
161 | break; | ||
162 | case INDEX_op_shr_i64: | ||
163 | t0 = *tb_ptr++; | ||
164 | - t1 = tci_read_r64(regs, &tb_ptr); | ||
165 | - t2 = tci_read_r64(regs, &tb_ptr); | ||
166 | + t1 = tci_read_r(regs, &tb_ptr); | ||
167 | + t2 = tci_read_r(regs, &tb_ptr); | ||
168 | tci_write_reg(regs, t0, t1 >> (t2 & 63)); | ||
169 | break; | ||
170 | case INDEX_op_sar_i64: | ||
171 | t0 = *tb_ptr++; | ||
172 | - t1 = tci_read_r64(regs, &tb_ptr); | ||
173 | - t2 = tci_read_r64(regs, &tb_ptr); | ||
174 | + t1 = tci_read_r(regs, &tb_ptr); | ||
175 | + t2 = tci_read_r(regs, &tb_ptr); | ||
176 | tci_write_reg(regs, t0, ((int64_t)t1 >> (t2 & 63))); | ||
177 | break; | ||
178 | #if TCG_TARGET_HAS_rot_i64 | ||
179 | case INDEX_op_rotl_i64: | ||
180 | t0 = *tb_ptr++; | ||
181 | - t1 = tci_read_r64(regs, &tb_ptr); | ||
182 | - t2 = tci_read_r64(regs, &tb_ptr); | ||
183 | + t1 = tci_read_r(regs, &tb_ptr); | ||
184 | + t2 = tci_read_r(regs, &tb_ptr); | ||
185 | tci_write_reg(regs, t0, rol64(t1, t2 & 63)); | ||
186 | break; | ||
187 | case INDEX_op_rotr_i64: | ||
188 | t0 = *tb_ptr++; | ||
189 | - t1 = tci_read_r64(regs, &tb_ptr); | ||
190 | - t2 = tci_read_r64(regs, &tb_ptr); | ||
191 | + t1 = tci_read_r(regs, &tb_ptr); | ||
192 | + t2 = tci_read_r(regs, &tb_ptr); | ||
193 | tci_write_reg(regs, t0, ror64(t1, t2 & 63)); | ||
194 | break; | ||
195 | #endif | ||
196 | #if TCG_TARGET_HAS_deposit_i64 | ||
197 | case INDEX_op_deposit_i64: | ||
198 | t0 = *tb_ptr++; | ||
199 | - t1 = tci_read_r64(regs, &tb_ptr); | ||
200 | - t2 = tci_read_r64(regs, &tb_ptr); | ||
201 | + t1 = tci_read_r(regs, &tb_ptr); | ||
202 | + t2 = tci_read_r(regs, &tb_ptr); | ||
203 | tmp16 = *tb_ptr++; | ||
204 | tmp8 = *tb_ptr++; | ||
205 | tmp64 = (((1ULL << tmp8) - 1) << tmp16); | ||
206 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
207 | break; | ||
208 | #endif | ||
209 | case INDEX_op_brcond_i64: | ||
210 | - t0 = tci_read_r64(regs, &tb_ptr); | ||
211 | - t1 = tci_read_r64(regs, &tb_ptr); | ||
212 | + t0 = tci_read_r(regs, &tb_ptr); | ||
213 | + t1 = tci_read_r(regs, &tb_ptr); | ||
214 | condition = *tb_ptr++; | ||
215 | label = tci_read_label(&tb_ptr); | ||
216 | if (tci_compare64(t0, t1, condition)) { | ||
217 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
218 | #if TCG_TARGET_HAS_bswap64_i64 | ||
219 | case INDEX_op_bswap64_i64: | ||
220 | t0 = *tb_ptr++; | ||
221 | - t1 = tci_read_r64(regs, &tb_ptr); | ||
222 | + t1 = tci_read_r(regs, &tb_ptr); | ||
223 | tci_write_reg(regs, t0, bswap64(t1)); | ||
224 | break; | ||
225 | #endif | ||
226 | #if TCG_TARGET_HAS_not_i64 | ||
227 | case INDEX_op_not_i64: | ||
228 | t0 = *tb_ptr++; | ||
229 | - t1 = tci_read_r64(regs, &tb_ptr); | ||
230 | + t1 = tci_read_r(regs, &tb_ptr); | ||
231 | tci_write_reg(regs, t0, ~t1); | ||
232 | break; | ||
233 | #endif | ||
234 | #if TCG_TARGET_HAS_neg_i64 | ||
235 | case INDEX_op_neg_i64: | ||
236 | t0 = *tb_ptr++; | ||
237 | - t1 = tci_read_r64(regs, &tb_ptr); | ||
238 | + t1 = tci_read_r(regs, &tb_ptr); | ||
239 | tci_write_reg(regs, t0, -t1); | ||
240 | break; | ||
241 | #endif | ||
242 | -- | ||
243 | 2.25.1 | ||
244 | |||
245 | diff view generated by jsdifflib |
New patch | |||
---|---|---|---|
1 | This includes add, sub, mul, and, or, xor. | ||
1 | 2 | ||
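The CASE_32_64() and CASE_64() helpers relied on below are introduced by an earlier patch in the series and are not visible in this hunk; a minimal sketch of their assumed shape, expanding one opcode name into the 32-bit case label plus, on 64-bit hosts, the matching 64-bit one:

    /* Assumed shape of the case-merging helpers (defined earlier in the
     * series; shown here for reference only). */
    #if TCG_TARGET_REG_BITS == 64
    # define CASE_32_64(x) \
            case glue(glue(INDEX_op_, x), _i32): \
            case glue(glue(INDEX_op_, x), _i64):
    # define CASE_64(x) \
            case glue(glue(INDEX_op_, x), _i64):
    #else
    # define CASE_32_64(x) \
            case glue(glue(INDEX_op_, x), _i32):
    # define CASE_64(x)
    #endif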
3 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
4 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
5 | --- | ||
6 | tcg/tci.c | 83 +++++++++++++++++-------------------------------------- | ||
7 | 1 file changed, 25 insertions(+), 58 deletions(-) | ||
8 | |||
9 | diff --git a/tcg/tci.c b/tcg/tci.c | ||
10 | index XXXXXXX..XXXXXXX 100644 | ||
11 | --- a/tcg/tci.c | ||
12 | +++ b/tcg/tci.c | ||
13 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
14 | *(uint32_t *)(t1 + t2) = t0; | ||
15 | break; | ||
16 | |||
17 | - /* Arithmetic operations (32 bit). */ | ||
18 | + /* Arithmetic operations (mixed 32/64 bit). */ | ||
19 | |||
20 | - case INDEX_op_add_i32: | ||
21 | + CASE_32_64(add) | ||
22 | t0 = *tb_ptr++; | ||
23 | t1 = tci_read_r(regs, &tb_ptr); | ||
24 | t2 = tci_read_r(regs, &tb_ptr); | ||
25 | tci_write_reg(regs, t0, t1 + t2); | ||
26 | break; | ||
27 | - case INDEX_op_sub_i32: | ||
28 | + CASE_32_64(sub) | ||
29 | t0 = *tb_ptr++; | ||
30 | t1 = tci_read_r(regs, &tb_ptr); | ||
31 | t2 = tci_read_r(regs, &tb_ptr); | ||
32 | tci_write_reg(regs, t0, t1 - t2); | ||
33 | break; | ||
34 | - case INDEX_op_mul_i32: | ||
35 | + CASE_32_64(mul) | ||
36 | t0 = *tb_ptr++; | ||
37 | t1 = tci_read_r(regs, &tb_ptr); | ||
38 | t2 = tci_read_r(regs, &tb_ptr); | ||
39 | tci_write_reg(regs, t0, t1 * t2); | ||
40 | break; | ||
41 | + CASE_32_64(and) | ||
42 | + t0 = *tb_ptr++; | ||
43 | + t1 = tci_read_r(regs, &tb_ptr); | ||
44 | + t2 = tci_read_r(regs, &tb_ptr); | ||
45 | + tci_write_reg(regs, t0, t1 & t2); | ||
46 | + break; | ||
47 | + CASE_32_64(or) | ||
48 | + t0 = *tb_ptr++; | ||
49 | + t1 = tci_read_r(regs, &tb_ptr); | ||
50 | + t2 = tci_read_r(regs, &tb_ptr); | ||
51 | + tci_write_reg(regs, t0, t1 | t2); | ||
52 | + break; | ||
53 | + CASE_32_64(xor) | ||
54 | + t0 = *tb_ptr++; | ||
55 | + t1 = tci_read_r(regs, &tb_ptr); | ||
56 | + t2 = tci_read_r(regs, &tb_ptr); | ||
57 | + tci_write_reg(regs, t0, t1 ^ t2); | ||
58 | + break; | ||
59 | + | ||
60 | + /* Arithmetic operations (32 bit). */ | ||
61 | + | ||
62 | case INDEX_op_div_i32: | ||
63 | t0 = *tb_ptr++; | ||
64 | t1 = tci_read_r(regs, &tb_ptr); | ||
65 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
66 | t2 = tci_read_r(regs, &tb_ptr); | ||
67 | tci_write_reg(regs, t0, (uint32_t)t1 % (uint32_t)t2); | ||
68 | break; | ||
69 | - case INDEX_op_and_i32: | ||
70 | - t0 = *tb_ptr++; | ||
71 | - t1 = tci_read_r(regs, &tb_ptr); | ||
72 | - t2 = tci_read_r(regs, &tb_ptr); | ||
73 | - tci_write_reg(regs, t0, t1 & t2); | ||
74 | - break; | ||
75 | - case INDEX_op_or_i32: | ||
76 | - t0 = *tb_ptr++; | ||
77 | - t1 = tci_read_r(regs, &tb_ptr); | ||
78 | - t2 = tci_read_r(regs, &tb_ptr); | ||
79 | - tci_write_reg(regs, t0, t1 | t2); | ||
80 | - break; | ||
81 | - case INDEX_op_xor_i32: | ||
82 | - t0 = *tb_ptr++; | ||
83 | - t1 = tci_read_r(regs, &tb_ptr); | ||
84 | - t2 = tci_read_r(regs, &tb_ptr); | ||
85 | - tci_write_reg(regs, t0, t1 ^ t2); | ||
86 | - break; | ||
87 | |||
88 | /* Shift/rotate operations (32 bit). */ | ||
89 | |||
90 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
91 | |||
92 | /* Arithmetic operations (64 bit). */ | ||
93 | |||
94 | - case INDEX_op_add_i64: | ||
95 | - t0 = *tb_ptr++; | ||
96 | - t1 = tci_read_r(regs, &tb_ptr); | ||
97 | - t2 = tci_read_r(regs, &tb_ptr); | ||
98 | - tci_write_reg(regs, t0, t1 + t2); | ||
99 | - break; | ||
100 | - case INDEX_op_sub_i64: | ||
101 | - t0 = *tb_ptr++; | ||
102 | - t1 = tci_read_r(regs, &tb_ptr); | ||
103 | - t2 = tci_read_r(regs, &tb_ptr); | ||
104 | - tci_write_reg(regs, t0, t1 - t2); | ||
105 | - break; | ||
106 | - case INDEX_op_mul_i64: | ||
107 | - t0 = *tb_ptr++; | ||
108 | - t1 = tci_read_r(regs, &tb_ptr); | ||
109 | - t2 = tci_read_r(regs, &tb_ptr); | ||
110 | - tci_write_reg(regs, t0, t1 * t2); | ||
111 | - break; | ||
112 | case INDEX_op_div_i64: | ||
113 | t0 = *tb_ptr++; | ||
114 | t1 = tci_read_r(regs, &tb_ptr); | ||
115 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
116 | t2 = tci_read_r(regs, &tb_ptr); | ||
117 | tci_write_reg(regs, t0, (uint64_t)t1 % (uint64_t)t2); | ||
118 | break; | ||
119 | - case INDEX_op_and_i64: | ||
120 | - t0 = *tb_ptr++; | ||
121 | - t1 = tci_read_r(regs, &tb_ptr); | ||
122 | - t2 = tci_read_r(regs, &tb_ptr); | ||
123 | - tci_write_reg(regs, t0, t1 & t2); | ||
124 | - break; | ||
125 | - case INDEX_op_or_i64: | ||
126 | - t0 = *tb_ptr++; | ||
127 | - t1 = tci_read_r(regs, &tb_ptr); | ||
128 | - t2 = tci_read_r(regs, &tb_ptr); | ||
129 | - tci_write_reg(regs, t0, t1 | t2); | ||
130 | - break; | ||
131 | - case INDEX_op_xor_i64: | ||
132 | - t0 = *tb_ptr++; | ||
133 | - t1 = tci_read_r(regs, &tb_ptr); | ||
134 | - t2 = tci_read_r(regs, &tb_ptr); | ||
135 | - tci_write_reg(regs, t0, t1 ^ t2); | ||
136 | - break; | ||
137 | |||
138 | /* Shift/rotate operations (64 bit). */ | ||
139 | |||
140 | -- | ||
141 | 2.25.1 | ||
142 | |||
143 | diff view generated by jsdifflib |
New patch | |||
---|---|---|---|
1 | This includes ext8s, ext8u, ext16s, ext16u. | ||
1 | 2 | ||
3 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
4 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
5 | --- | ||
6 | tcg/tci.c | 44 ++++++++------------------------------------ | ||
7 | 1 file changed, 8 insertions(+), 36 deletions(-) | ||
8 | |||
9 | diff --git a/tcg/tci.c b/tcg/tci.c | ||
10 | index XXXXXXX..XXXXXXX 100644 | ||
11 | --- a/tcg/tci.c | ||
12 | +++ b/tcg/tci.c | ||
13 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
14 | tci_write_reg64(regs, t1, t0, (uint32_t)t2 * tmp64); | ||
15 | break; | ||
16 | #endif /* TCG_TARGET_REG_BITS == 32 */ | ||
17 | -#if TCG_TARGET_HAS_ext8s_i32 | ||
18 | - case INDEX_op_ext8s_i32: | ||
19 | +#if TCG_TARGET_HAS_ext8s_i32 || TCG_TARGET_HAS_ext8s_i64 | ||
20 | + CASE_32_64(ext8s) | ||
21 | t0 = *tb_ptr++; | ||
22 | t1 = tci_read_r(regs, &tb_ptr); | ||
23 | tci_write_reg(regs, t0, (int8_t)t1); | ||
24 | break; | ||
25 | #endif | ||
26 | -#if TCG_TARGET_HAS_ext16s_i32 | ||
27 | - case INDEX_op_ext16s_i32: | ||
28 | +#if TCG_TARGET_HAS_ext16s_i32 || TCG_TARGET_HAS_ext16s_i64 | ||
29 | + CASE_32_64(ext16s) | ||
30 | t0 = *tb_ptr++; | ||
31 | t1 = tci_read_r(regs, &tb_ptr); | ||
32 | tci_write_reg(regs, t0, (int16_t)t1); | ||
33 | break; | ||
34 | #endif | ||
35 | -#if TCG_TARGET_HAS_ext8u_i32 | ||
36 | - case INDEX_op_ext8u_i32: | ||
37 | +#if TCG_TARGET_HAS_ext8u_i32 || TCG_TARGET_HAS_ext8u_i64 | ||
38 | + CASE_32_64(ext8u) | ||
39 | t0 = *tb_ptr++; | ||
40 | t1 = tci_read_r(regs, &tb_ptr); | ||
41 | tci_write_reg(regs, t0, (uint8_t)t1); | ||
42 | break; | ||
43 | #endif | ||
44 | -#if TCG_TARGET_HAS_ext16u_i32 | ||
45 | - case INDEX_op_ext16u_i32: | ||
46 | +#if TCG_TARGET_HAS_ext16u_i32 || TCG_TARGET_HAS_ext16u_i64 | ||
47 | + CASE_32_64(ext16u) | ||
48 | t0 = *tb_ptr++; | ||
49 | t1 = tci_read_r(regs, &tb_ptr); | ||
50 | tci_write_reg(regs, t0, (uint16_t)t1); | ||
51 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
52 | continue; | ||
53 | } | ||
54 | break; | ||
55 | -#if TCG_TARGET_HAS_ext8u_i64 | ||
56 | - case INDEX_op_ext8u_i64: | ||
57 | - t0 = *tb_ptr++; | ||
58 | - t1 = tci_read_r(regs, &tb_ptr); | ||
59 | - tci_write_reg(regs, t0, (uint8_t)t1); | ||
60 | - break; | ||
61 | -#endif | ||
62 | -#if TCG_TARGET_HAS_ext8s_i64 | ||
63 | - case INDEX_op_ext8s_i64: | ||
64 | - t0 = *tb_ptr++; | ||
65 | - t1 = tci_read_r(regs, &tb_ptr); | ||
66 | - tci_write_reg(regs, t0, (int8_t)t1); | ||
67 | - break; | ||
68 | -#endif | ||
69 | -#if TCG_TARGET_HAS_ext16s_i64 | ||
70 | - case INDEX_op_ext16s_i64: | ||
71 | - t0 = *tb_ptr++; | ||
72 | - t1 = tci_read_r(regs, &tb_ptr); | ||
73 | - tci_write_reg(regs, t0, (int16_t)t1); | ||
74 | - break; | ||
75 | -#endif | ||
76 | -#if TCG_TARGET_HAS_ext16u_i64 | ||
77 | - case INDEX_op_ext16u_i64: | ||
78 | - t0 = *tb_ptr++; | ||
79 | - t1 = tci_read_r(regs, &tb_ptr); | ||
80 | - tci_write_reg(regs, t0, (uint16_t)t1); | ||
81 | - break; | ||
82 | -#endif | ||
83 | #if TCG_TARGET_HAS_ext32s_i64 | ||
84 | case INDEX_op_ext32s_i64: | ||
85 | #endif | ||
86 | -- | ||
87 | 2.25.1 | ||
88 | |||
89 | diff view generated by jsdifflib |
New patch | |||
---|---|---|---|
1 | This includes bswap16 and bswap32. | ||
1 | 2 | ||
3 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
4 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
5 | --- | ||
6 | tcg/tci.c | 22 ++++------------------ | ||
7 | 1 file changed, 4 insertions(+), 18 deletions(-) | ||
8 | |||
9 | diff --git a/tcg/tci.c b/tcg/tci.c | ||
10 | index XXXXXXX..XXXXXXX 100644 | ||
11 | --- a/tcg/tci.c | ||
12 | +++ b/tcg/tci.c | ||
13 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
14 | tci_write_reg(regs, t0, (uint16_t)t1); | ||
15 | break; | ||
16 | #endif | ||
17 | -#if TCG_TARGET_HAS_bswap16_i32 | ||
18 | - case INDEX_op_bswap16_i32: | ||
19 | +#if TCG_TARGET_HAS_bswap16_i32 || TCG_TARGET_HAS_bswap16_i64 | ||
20 | + CASE_32_64(bswap16) | ||
21 | t0 = *tb_ptr++; | ||
22 | t1 = tci_read_r(regs, &tb_ptr); | ||
23 | tci_write_reg(regs, t0, bswap16(t1)); | ||
24 | break; | ||
25 | #endif | ||
26 | -#if TCG_TARGET_HAS_bswap32_i32 | ||
27 | - case INDEX_op_bswap32_i32: | ||
28 | +#if TCG_TARGET_HAS_bswap32_i32 || TCG_TARGET_HAS_bswap32_i64 | ||
29 | + CASE_32_64(bswap32) | ||
30 | t0 = *tb_ptr++; | ||
31 | t1 = tci_read_r(regs, &tb_ptr); | ||
32 | tci_write_reg(regs, t0, bswap32(t1)); | ||
33 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
34 | t1 = tci_read_r(regs, &tb_ptr); | ||
35 | tci_write_reg(regs, t0, (uint32_t)t1); | ||
36 | break; | ||
37 | -#if TCG_TARGET_HAS_bswap16_i64 | ||
38 | - case INDEX_op_bswap16_i64: | ||
39 | - t0 = *tb_ptr++; | ||
40 | - t1 = tci_read_r(regs, &tb_ptr); | ||
41 | - tci_write_reg(regs, t0, bswap16(t1)); | ||
42 | - break; | ||
43 | -#endif | ||
44 | -#if TCG_TARGET_HAS_bswap32_i64 | ||
45 | - case INDEX_op_bswap32_i64: | ||
46 | - t0 = *tb_ptr++; | ||
47 | - t1 = tci_read_r(regs, &tb_ptr); | ||
48 | - tci_write_reg(regs, t0, bswap32(t1)); | ||
49 | - break; | ||
50 | -#endif | ||
51 | #if TCG_TARGET_HAS_bswap64_i64 | ||
52 | case INDEX_op_bswap64_i64: | ||
53 | t0 = *tb_ptr++; | ||
54 | -- | ||
55 | 2.25.1 | ||
56 | |||
57 | diff view generated by jsdifflib |
New patch | |||
---|---|---|---|
1 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
2 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
3 | --- | ||
4 | tcg/tci.c | 29 +++++------------------------ | ||
5 | 1 file changed, 5 insertions(+), 24 deletions(-) | ||
1 | 6 | ||
7 | diff --git a/tcg/tci.c b/tcg/tci.c | ||
8 | index XXXXXXX..XXXXXXX 100644 | ||
9 | --- a/tcg/tci.c | ||
10 | +++ b/tcg/tci.c | ||
11 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
12 | tci_write_reg(regs, t0, tci_compare64(t1, t2, condition)); | ||
13 | break; | ||
14 | #endif | ||
15 | - case INDEX_op_mov_i32: | ||
16 | + CASE_32_64(mov) | ||
17 | t0 = *tb_ptr++; | ||
18 | t1 = tci_read_r(regs, &tb_ptr); | ||
19 | tci_write_reg(regs, t0, t1); | ||
20 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
21 | tci_write_reg(regs, t0, bswap32(t1)); | ||
22 | break; | ||
23 | #endif | ||
24 | -#if TCG_TARGET_HAS_not_i32 | ||
25 | - case INDEX_op_not_i32: | ||
26 | +#if TCG_TARGET_HAS_not_i32 || TCG_TARGET_HAS_not_i64 | ||
27 | + CASE_32_64(not) | ||
28 | t0 = *tb_ptr++; | ||
29 | t1 = tci_read_r(regs, &tb_ptr); | ||
30 | tci_write_reg(regs, t0, ~t1); | ||
31 | break; | ||
32 | #endif | ||
33 | -#if TCG_TARGET_HAS_neg_i32 | ||
34 | - case INDEX_op_neg_i32: | ||
35 | +#if TCG_TARGET_HAS_neg_i32 || TCG_TARGET_HAS_neg_i64 | ||
36 | + CASE_32_64(neg) | ||
37 | t0 = *tb_ptr++; | ||
38 | t1 = tci_read_r(regs, &tb_ptr); | ||
39 | tci_write_reg(regs, t0, -t1); | ||
40 | break; | ||
41 | #endif | ||
42 | #if TCG_TARGET_REG_BITS == 64 | ||
43 | - case INDEX_op_mov_i64: | ||
44 | - t0 = *tb_ptr++; | ||
45 | - t1 = tci_read_r(regs, &tb_ptr); | ||
46 | - tci_write_reg(regs, t0, t1); | ||
47 | - break; | ||
48 | case INDEX_op_tci_movi_i64: | ||
49 | t0 = *tb_ptr++; | ||
50 | t1 = tci_read_i64(&tb_ptr); | ||
51 | @@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, | ||
52 | tci_write_reg(regs, t0, bswap64(t1)); | ||
53 | break; | ||
54 | #endif | ||
55 | -#if TCG_TARGET_HAS_not_i64 | ||
56 | - case INDEX_op_not_i64: | ||
57 | - t0 = *tb_ptr++; | ||
58 | - t1 = tci_read_r(regs, &tb_ptr); | ||
59 | - tci_write_reg(regs, t0, ~t1); | ||
60 | - break; | ||
61 | -#endif | ||
62 | -#if TCG_TARGET_HAS_neg_i64 | ||
63 | - case INDEX_op_neg_i64: | ||
64 | - t0 = *tb_ptr++; | ||
65 | - t1 = tci_read_r(regs, &tb_ptr); | ||
66 | - tci_write_reg(regs, t0, -t1); | ||
67 | - break; | ||
68 | -#endif | ||
69 | #endif /* TCG_TARGET_REG_BITS == 64 */ | ||
70 | |||
71 | /* QEMU specific operations. */ | ||
72 | -- | ||
73 | 2.25.1 | ||
74 | |||
75 | diff view generated by jsdifflib |
New patch | |||
---|---|---|---|
1 | From: Alex Bennée <alex.bennee@linaro.org> | ||
1 | 2 | ||
3 | Having a function return both a valid TB and some system state | ||
4 | seems excessive. It will make the subsequent refactoring easier if we | ||
5 | look up the current state at the point where it is needed. | ||
6 | |||
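The caller-side pattern that results, condensed from the hunks below (a sketch, not additional code):

    /* Callers now derive the CPU state themselves and pass it by value. */
    cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
    tb = tb_lookup(cpu, pc, cs_base, flags, cf_mask);
    if (tb == NULL) {
        mmap_lock();
        tb = tb_gen_code(cpu, pc, cs_base, flags, cf_mask);
        mmap_unlock();
    }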
7 | Signed-off-by: Alex Bennée <alex.bennee@linaro.org> | ||
8 | Message-Id: <20210224165811.11567-2-alex.bennee@linaro.org> | ||
9 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
10 | --- | ||
11 | include/exec/tb-lookup.h | 18 ++++++++---------- | ||
12 | accel/tcg/cpu-exec.c | 10 ++++++++-- | ||
13 | accel/tcg/tcg-runtime.c | 4 +++- | ||
14 | 3 files changed, 19 insertions(+), 13 deletions(-) | ||
15 | |||
16 | diff --git a/include/exec/tb-lookup.h b/include/exec/tb-lookup.h | ||
17 | index XXXXXXX..XXXXXXX 100644 | ||
18 | --- a/include/exec/tb-lookup.h | ||
19 | +++ b/include/exec/tb-lookup.h | ||
20 | @@ -XXX,XX +XXX,XX @@ | ||
21 | #include "exec/tb-hash.h" | ||
22 | |||
23 | /* Might cause an exception, so have a longjmp destination ready */ | ||
24 | -static inline TranslationBlock * | ||
25 | -tb_lookup__cpu_state(CPUState *cpu, target_ulong *pc, target_ulong *cs_base, | ||
26 | - uint32_t *flags, uint32_t cf_mask) | ||
27 | +static inline TranslationBlock * tb_lookup(CPUState *cpu, | ||
28 | + target_ulong pc, target_ulong cs_base, | ||
29 | + uint32_t flags, uint32_t cf_mask) | ||
30 | { | ||
31 | - CPUArchState *env = (CPUArchState *)cpu->env_ptr; | ||
32 | TranslationBlock *tb; | ||
33 | uint32_t hash; | ||
34 | |||
35 | - cpu_get_tb_cpu_state(env, pc, cs_base, flags); | ||
36 | - hash = tb_jmp_cache_hash_func(*pc); | ||
37 | + hash = tb_jmp_cache_hash_func(pc); | ||
38 | tb = qatomic_rcu_read(&cpu->tb_jmp_cache[hash]); | ||
39 | |||
40 | cf_mask &= ~CF_CLUSTER_MASK; | ||
41 | cf_mask |= cpu->cluster_index << CF_CLUSTER_SHIFT; | ||
42 | |||
43 | if (likely(tb && | ||
44 | - tb->pc == *pc && | ||
45 | - tb->cs_base == *cs_base && | ||
46 | - tb->flags == *flags && | ||
47 | + tb->pc == pc && | ||
48 | + tb->cs_base == cs_base && | ||
49 | + tb->flags == flags && | ||
50 | tb->trace_vcpu_dstate == *cpu->trace_dstate && | ||
51 | (tb_cflags(tb) & (CF_HASH_MASK | CF_INVALID)) == cf_mask)) { | ||
52 | return tb; | ||
53 | } | ||
54 | - tb = tb_htable_lookup(cpu, *pc, *cs_base, *flags, cf_mask); | ||
55 | + tb = tb_htable_lookup(cpu, pc, cs_base, flags, cf_mask); | ||
56 | if (tb == NULL) { | ||
57 | return NULL; | ||
58 | } | ||
59 | diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c | ||
60 | index XXXXXXX..XXXXXXX 100644 | ||
61 | --- a/accel/tcg/cpu-exec.c | ||
62 | +++ b/accel/tcg/cpu-exec.c | ||
63 | @@ -XXX,XX +XXX,XX @@ static void cpu_exec_exit(CPUState *cpu) | ||
64 | |||
65 | void cpu_exec_step_atomic(CPUState *cpu) | ||
66 | { | ||
67 | + CPUArchState *env = (CPUArchState *)cpu->env_ptr; | ||
68 | TranslationBlock *tb; | ||
69 | target_ulong cs_base, pc; | ||
70 | uint32_t flags; | ||
71 | @@ -XXX,XX +XXX,XX @@ void cpu_exec_step_atomic(CPUState *cpu) | ||
72 | g_assert(!cpu->running); | ||
73 | cpu->running = true; | ||
74 | |||
75 | - tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask); | ||
76 | + cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags); | ||
77 | + tb = tb_lookup(cpu, pc, cs_base, flags, cf_mask); | ||
78 | + | ||
79 | if (tb == NULL) { | ||
80 | mmap_lock(); | ||
81 | tb = tb_gen_code(cpu, pc, cs_base, flags, cflags); | ||
82 | @@ -XXX,XX +XXX,XX @@ static inline TranslationBlock *tb_find(CPUState *cpu, | ||
83 | TranslationBlock *last_tb, | ||
84 | int tb_exit, uint32_t cf_mask) | ||
85 | { | ||
86 | + CPUArchState *env = (CPUArchState *)cpu->env_ptr; | ||
87 | TranslationBlock *tb; | ||
88 | target_ulong cs_base, pc; | ||
89 | uint32_t flags; | ||
90 | |||
91 | - tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask); | ||
92 | + cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags); | ||
93 | + | ||
94 | + tb = tb_lookup(cpu, pc, cs_base, flags, cf_mask); | ||
95 | if (tb == NULL) { | ||
96 | mmap_lock(); | ||
97 | tb = tb_gen_code(cpu, pc, cs_base, flags, cf_mask); | ||
98 | diff --git a/accel/tcg/tcg-runtime.c b/accel/tcg/tcg-runtime.c | ||
99 | index XXXXXXX..XXXXXXX 100644 | ||
100 | --- a/accel/tcg/tcg-runtime.c | ||
101 | +++ b/accel/tcg/tcg-runtime.c | ||
102 | @@ -XXX,XX +XXX,XX @@ const void *HELPER(lookup_tb_ptr)(CPUArchState *env) | ||
103 | target_ulong cs_base, pc; | ||
104 | uint32_t flags; | ||
105 | |||
106 | - tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, curr_cflags()); | ||
107 | + cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags); | ||
108 | + | ||
109 | + tb = tb_lookup(cpu, pc, cs_base, flags, curr_cflags()); | ||
110 | if (tb == NULL) { | ||
111 | return tcg_code_gen_epilogue; | ||
112 | } | ||
113 | -- | ||
114 | 2.25.1 | ||
115 | |||
116 | diff view generated by jsdifflib |
New patch | |||
---|---|---|---|
1 | From: Alex Bennée <alex.bennee@linaro.org> | ||
1 | 2 | ||
3 | There is nothing special about this compile flag that means we can't | ||
4 | just compute it with curr_cflags(), which we should be using when | ||
5 | building a new set. | ||
6 | |||
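As a worked example of where the cluster ID now lands (illustration only; CF_CLUSTER_SHIFT is 24 and the field is 8 bits wide, per the definitions in exec-all.h, so deposit32() is equivalent to the explicit shift-and-mask form):

    /* Illustration, not part of the patch. */
    uint32_t a = deposit32(0, CF_CLUSTER_SHIFT, 8, cpu->cluster_index);
    uint32_t b = (cpu->cluster_index << CF_CLUSTER_SHIFT) & CF_CLUSTER_MASK;
    assert(a == b);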
7 | Signed-off-by: Alex Bennée <alex.bennee@linaro.org> | ||
8 | Message-Id: <20210224165811.11567-3-alex.bennee@linaro.org> | ||
9 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
10 | --- | ||
11 | include/exec/exec-all.h | 8 +++++--- | ||
12 | include/exec/tb-lookup.h | 3 --- | ||
13 | accel/tcg/cpu-exec.c | 9 ++++----- | ||
14 | accel/tcg/tcg-runtime.c | 2 +- | ||
15 | accel/tcg/translate-all.c | 6 +++--- | ||
16 | softmmu/physmem.c | 2 +- | ||
17 | 6 files changed, 14 insertions(+), 16 deletions(-) | ||
18 | |||
19 | diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h | ||
20 | index XXXXXXX..XXXXXXX 100644 | ||
21 | --- a/include/exec/exec-all.h | ||
22 | +++ b/include/exec/exec-all.h | ||
23 | @@ -XXX,XX +XXX,XX @@ static inline uint32_t tb_cflags(const TranslationBlock *tb) | ||
24 | } | ||
25 | |||
26 | /* current cflags for hashing/comparison */ | ||
27 | -static inline uint32_t curr_cflags(void) | ||
28 | +static inline uint32_t curr_cflags(CPUState *cpu) | ||
29 | { | ||
30 | - return (parallel_cpus ? CF_PARALLEL : 0) | ||
31 | - | (icount_enabled() ? CF_USE_ICOUNT : 0); | ||
32 | + uint32_t cflags = deposit32(0, CF_CLUSTER_SHIFT, 8, cpu->cluster_index); | ||
33 | + cflags |= parallel_cpus ? CF_PARALLEL : 0; | ||
34 | + cflags |= icount_enabled() ? CF_USE_ICOUNT : 0; | ||
35 | + return cflags; | ||
36 | } | ||
37 | |||
38 | /* TranslationBlock invalidate API */ | ||
39 | diff --git a/include/exec/tb-lookup.h b/include/exec/tb-lookup.h | ||
40 | index XXXXXXX..XXXXXXX 100644 | ||
41 | --- a/include/exec/tb-lookup.h | ||
42 | +++ b/include/exec/tb-lookup.h | ||
43 | @@ -XXX,XX +XXX,XX @@ static inline TranslationBlock * tb_lookup(CPUState *cpu, | ||
44 | hash = tb_jmp_cache_hash_func(pc); | ||
45 | tb = qatomic_rcu_read(&cpu->tb_jmp_cache[hash]); | ||
46 | |||
47 | - cf_mask &= ~CF_CLUSTER_MASK; | ||
48 | - cf_mask |= cpu->cluster_index << CF_CLUSTER_SHIFT; | ||
49 | - | ||
50 | if (likely(tb && | ||
51 | tb->pc == pc && | ||
52 | tb->cs_base == cs_base && | ||
53 | diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c | ||
54 | index XXXXXXX..XXXXXXX 100644 | ||
55 | --- a/accel/tcg/cpu-exec.c | ||
56 | +++ b/accel/tcg/cpu-exec.c | ||
57 | @@ -XXX,XX +XXX,XX @@ void cpu_exec_step_atomic(CPUState *cpu) | ||
58 | TranslationBlock *tb; | ||
59 | target_ulong cs_base, pc; | ||
60 | uint32_t flags; | ||
61 | - uint32_t cflags = 1; | ||
62 | - uint32_t cf_mask = cflags & CF_HASH_MASK; | ||
63 | + uint32_t cflags = (curr_cflags(cpu) & ~CF_PARALLEL) | 1; | ||
64 | int tb_exit; | ||
65 | |||
66 | if (sigsetjmp(cpu->jmp_env, 0) == 0) { | ||
67 | @@ -XXX,XX +XXX,XX @@ void cpu_exec_step_atomic(CPUState *cpu) | ||
68 | cpu->running = true; | ||
69 | |||
70 | cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags); | ||
71 | - tb = tb_lookup(cpu, pc, cs_base, flags, cf_mask); | ||
72 | + tb = tb_lookup(cpu, pc, cs_base, flags, cflags); | ||
73 | |||
74 | if (tb == NULL) { | ||
75 | mmap_lock(); | ||
76 | @@ -XXX,XX +XXX,XX @@ static inline bool cpu_handle_exception(CPUState *cpu, int *ret) | ||
77 | if (replay_has_exception() | ||
78 | && cpu_neg(cpu)->icount_decr.u16.low + cpu->icount_extra == 0) { | ||
79 | /* Execute just one insn to trigger exception pending in the log */ | ||
80 | - cpu->cflags_next_tb = (curr_cflags() & ~CF_USE_ICOUNT) | 1; | ||
81 | + cpu->cflags_next_tb = (curr_cflags(cpu) & ~CF_USE_ICOUNT) | 1; | ||
82 | } | ||
83 | #endif | ||
84 | return false; | ||
85 | @@ -XXX,XX +XXX,XX @@ int cpu_exec(CPUState *cpu) | ||
86 | have CF_INVALID set, -1 is a convenient invalid value that | ||
87 | does not require tcg headers for cpu_common_reset. */ | ||
88 | if (cflags == -1) { | ||
89 | - cflags = curr_cflags(); | ||
90 | + cflags = curr_cflags(cpu); | ||
91 | } else { | ||
92 | cpu->cflags_next_tb = -1; | ||
93 | } | ||
94 | diff --git a/accel/tcg/tcg-runtime.c b/accel/tcg/tcg-runtime.c | ||
95 | index XXXXXXX..XXXXXXX 100644 | ||
96 | --- a/accel/tcg/tcg-runtime.c | ||
97 | +++ b/accel/tcg/tcg-runtime.c | ||
98 | @@ -XXX,XX +XXX,XX @@ const void *HELPER(lookup_tb_ptr)(CPUArchState *env) | ||
99 | |||
100 | cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags); | ||
101 | |||
102 | - tb = tb_lookup(cpu, pc, cs_base, flags, curr_cflags()); | ||
103 | + tb = tb_lookup(cpu, pc, cs_base, flags, curr_cflags(cpu)); | ||
104 | if (tb == NULL) { | ||
105 | return tcg_code_gen_epilogue; | ||
106 | } | ||
107 | diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c | ||
108 | index XXXXXXX..XXXXXXX 100644 | ||
109 | --- a/accel/tcg/translate-all.c | ||
110 | +++ b/accel/tcg/translate-all.c | ||
111 | @@ -XXX,XX +XXX,XX @@ tb_invalidate_phys_page_range__locked(struct page_collection *pages, | ||
112 | if (current_tb_modified) { | ||
113 | page_collection_unlock(pages); | ||
114 | /* Force execution of one insn next time. */ | ||
115 | - cpu->cflags_next_tb = 1 | curr_cflags(); | ||
116 | + cpu->cflags_next_tb = 1 | curr_cflags(cpu); | ||
117 | mmap_unlock(); | ||
118 | cpu_loop_exit_noexc(cpu); | ||
119 | } | ||
120 | @@ -XXX,XX +XXX,XX @@ static bool tb_invalidate_phys_page(tb_page_addr_t addr, uintptr_t pc) | ||
121 | #ifdef TARGET_HAS_PRECISE_SMC | ||
122 | if (current_tb_modified) { | ||
123 | /* Force execution of one insn next time. */ | ||
124 | - cpu->cflags_next_tb = 1 | curr_cflags(); | ||
125 | + cpu->cflags_next_tb = 1 | curr_cflags(cpu); | ||
126 | return true; | ||
127 | } | ||
128 | #endif | ||
129 | @@ -XXX,XX +XXX,XX @@ void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr) | ||
130 | * operations only (which execute after completion) so we don't | ||
131 | * double instrument the instruction. | ||
132 | */ | ||
133 | - cpu->cflags_next_tb = curr_cflags() | CF_MEMI_ONLY | CF_LAST_IO | n; | ||
134 | + cpu->cflags_next_tb = curr_cflags(cpu) | CF_MEMI_ONLY | CF_LAST_IO | n; | ||
135 | |||
136 | qemu_log_mask_and_addr(CPU_LOG_EXEC, tb->pc, | ||
137 | "cpu_io_recompile: rewound execution of TB to " | ||
138 | diff --git a/softmmu/physmem.c b/softmmu/physmem.c | ||
139 | index XXXXXXX..XXXXXXX 100644 | ||
140 | --- a/softmmu/physmem.c | ||
141 | +++ b/softmmu/physmem.c | ||
142 | @@ -XXX,XX +XXX,XX @@ void cpu_check_watchpoint(CPUState *cpu, vaddr addr, vaddr len, | ||
143 | cpu_loop_exit_restore(cpu, ra); | ||
144 | } else { | ||
145 | /* Force execution of one insn next time. */ | ||
146 | - cpu->cflags_next_tb = 1 | curr_cflags(); | ||
147 | + cpu->cflags_next_tb = 1 | curr_cflags(cpu); | ||
148 | mmap_unlock(); | ||
149 | if (ra) { | ||
150 | cpu_restore_state(cpu, ra, true); | ||
151 | -- | ||
152 | 2.25.1 | ||
153 | |||
154 | diff view generated by jsdifflib |
1 | From: Claudio Fontana <cfontana@suse.de> | 1 | From: Alex Bennée <alex.bennee@linaro.org> |
---|---|---|---|
2 | 2 | ||
3 | Signed-off-by: Claudio Fontana <cfontana@suse.de> | 3 | We don't really deal in cf_mask most of the time. The one time it's |
4 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | 4 | relevant is when we want to remove an invalidated TB from the QHT |
5 | Message-Id: <20201015143217.29337-4-cfontana@suse.de> | 5 | lookup. Everywhere else we should be looking up things without |
6 | CF_INVALID set. | ||
7 | |||
8 | Signed-off-by: Alex Bennée <alex.bennee@linaro.org> | ||
9 | Message-Id: <20210224165811.11567-4-alex.bennee@linaro.org> | ||
6 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | 10 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> |
7 | --- | 11 | --- |
8 | accel/tcg/tcg-cpus-icount.h | 6 +-- | 12 | include/exec/exec-all.h | 4 +--- |
9 | accel/tcg/tcg-cpus-rr.h | 2 +- | 13 | include/exec/tb-lookup.h | 9 ++++++--- |
10 | accel/tcg/tcg-cpus.h | 6 +-- | 14 | accel/tcg/cpu-exec.c | 16 ++++++++-------- |
11 | accel/tcg/tcg-cpus-icount.c | 24 ++++++------ | 15 | accel/tcg/tcg-runtime.c | 2 +- |
12 | accel/tcg/tcg-cpus-mttcg.c | 10 ++--- | 16 | accel/tcg/translate-all.c | 8 +++++--- |
13 | accel/tcg/tcg-cpus-rr.c | 74 ++++++++++++++++++------------------- | 17 | 5 files changed, 21 insertions(+), 18 deletions(-) |
14 | accel/tcg/tcg-cpus.c | 6 +-- | 18 | |
15 | 7 files changed, 64 insertions(+), 64 deletions(-) | 19 | diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h |
16 | 20 | index XXXXXXX..XXXXXXX 100644 | |
17 | diff --git a/accel/tcg/tcg-cpus-icount.h b/accel/tcg/tcg-cpus-icount.h | 21 | --- a/include/exec/exec-all.h |
18 | index XXXXXXX..XXXXXXX 100644 | 22 | +++ b/include/exec/exec-all.h |
19 | --- a/accel/tcg/tcg-cpus-icount.h | 23 | @@ -XXX,XX +XXX,XX @@ struct TranslationBlock { |
20 | +++ b/accel/tcg/tcg-cpus-icount.h | 24 | #define CF_PARALLEL 0x00080000 /* Generate code for a parallel context */ |
25 | #define CF_CLUSTER_MASK 0xff000000 /* Top 8 bits are cluster ID */ | ||
26 | #define CF_CLUSTER_SHIFT 24 | ||
27 | -/* cflags' mask for hashing/comparison, basically ignore CF_INVALID */ | ||
28 | -#define CF_HASH_MASK (~CF_INVALID) | ||
29 | |||
30 | /* Per-vCPU dynamic tracing state used to generate this TB */ | ||
31 | uint32_t trace_vcpu_dstate; | ||
32 | @@ -XXX,XX +XXX,XX @@ void tb_flush(CPUState *cpu); | ||
33 | void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr); | ||
34 | TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc, | ||
35 | target_ulong cs_base, uint32_t flags, | ||
36 | - uint32_t cf_mask); | ||
37 | + uint32_t cflags); | ||
38 | void tb_set_jmp_target(TranslationBlock *tb, int n, uintptr_t addr); | ||
39 | |||
40 | /* GETPC is the true target of the return instruction that we'll execute. */ | ||
41 | diff --git a/include/exec/tb-lookup.h b/include/exec/tb-lookup.h | ||
42 | index XXXXXXX..XXXXXXX 100644 | ||
43 | --- a/include/exec/tb-lookup.h | ||
44 | +++ b/include/exec/tb-lookup.h | ||
21 | @@ -XXX,XX +XXX,XX @@ | 45 | @@ -XXX,XX +XXX,XX @@ |
22 | #ifndef TCG_CPUS_ICOUNT_H | 46 | /* Might cause an exception, so have a longjmp destination ready */ |
23 | #define TCG_CPUS_ICOUNT_H | 47 | static inline TranslationBlock * tb_lookup(CPUState *cpu, |
24 | 48 | target_ulong pc, target_ulong cs_base, | |
25 | -void handle_icount_deadline(void); | 49 | - uint32_t flags, uint32_t cf_mask) |
26 | -void prepare_icount_for_run(CPUState *cpu); | 50 | + uint32_t flags, uint32_t cflags) |
27 | -void process_icount_data(CPUState *cpu); | 51 | { |
28 | +void icount_handle_deadline(void); | 52 | TranslationBlock *tb; |
29 | +void icount_prepare_for_run(CPUState *cpu); | 53 | uint32_t hash; |
30 | +void icount_process_data(CPUState *cpu); | 54 | |
31 | 55 | + /* we should never be trying to look up an INVALID tb */ | |
32 | #endif /* TCG_CPUS_ICOUNT_H */ | 56 | + tcg_debug_assert(!(cflags & CF_INVALID)); |
33 | diff --git a/accel/tcg/tcg-cpus-rr.h b/accel/tcg/tcg-cpus-rr.h | 57 | + |
34 | index XXXXXXX..XXXXXXX 100644 | 58 | hash = tb_jmp_cache_hash_func(pc); |
35 | --- a/accel/tcg/tcg-cpus-rr.h | 59 | tb = qatomic_rcu_read(&cpu->tb_jmp_cache[hash]); |
36 | +++ b/accel/tcg/tcg-cpus-rr.h | 60 | |
61 | @@ -XXX,XX +XXX,XX @@ static inline TranslationBlock * tb_lookup(CPUState *cpu, | ||
62 | tb->cs_base == cs_base && | ||
63 | tb->flags == flags && | ||
64 | tb->trace_vcpu_dstate == *cpu->trace_dstate && | ||
65 | - (tb_cflags(tb) & (CF_HASH_MASK | CF_INVALID)) == cf_mask)) { | ||
66 | + tb_cflags(tb) == cflags)) { | ||
67 | return tb; | ||
68 | } | ||
69 | - tb = tb_htable_lookup(cpu, pc, cs_base, flags, cf_mask); | ||
70 | + tb = tb_htable_lookup(cpu, pc, cs_base, flags, cflags); | ||
71 | if (tb == NULL) { | ||
72 | return NULL; | ||
73 | } | ||
74 | diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c | ||
75 | index XXXXXXX..XXXXXXX 100644 | ||
76 | --- a/accel/tcg/cpu-exec.c | ||
77 | +++ b/accel/tcg/cpu-exec.c | ||
78 | @@ -XXX,XX +XXX,XX @@ struct tb_desc { | ||
79 | CPUArchState *env; | ||
80 | tb_page_addr_t phys_page1; | ||
81 | uint32_t flags; | ||
82 | - uint32_t cf_mask; | ||
83 | + uint32_t cflags; | ||
84 | uint32_t trace_vcpu_dstate; | ||
85 | }; | ||
86 | |||
87 | @@ -XXX,XX +XXX,XX @@ static bool tb_lookup_cmp(const void *p, const void *d) | ||
88 | tb->cs_base == desc->cs_base && | ||
89 | tb->flags == desc->flags && | ||
90 | tb->trace_vcpu_dstate == desc->trace_vcpu_dstate && | ||
91 | - (tb_cflags(tb) & (CF_HASH_MASK | CF_INVALID)) == desc->cf_mask) { | ||
92 | + tb_cflags(tb) == desc->cflags) { | ||
93 | /* check next page if needed */ | ||
94 | if (tb->page_addr[1] == -1) { | ||
95 | return true; | ||
96 | @@ -XXX,XX +XXX,XX @@ static bool tb_lookup_cmp(const void *p, const void *d) | ||
97 | |||
98 | TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc, | ||
99 | target_ulong cs_base, uint32_t flags, | ||
100 | - uint32_t cf_mask) | ||
101 | + uint32_t cflags) | ||
102 | { | ||
103 | tb_page_addr_t phys_pc; | ||
104 | struct tb_desc desc; | ||
105 | @@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc, | ||
106 | desc.env = (CPUArchState *)cpu->env_ptr; | ||
107 | desc.cs_base = cs_base; | ||
108 | desc.flags = flags; | ||
109 | - desc.cf_mask = cf_mask; | ||
110 | + desc.cflags = cflags; | ||
111 | desc.trace_vcpu_dstate = *cpu->trace_dstate; | ||
112 | desc.pc = pc; | ||
113 | phys_pc = get_page_addr_code(desc.env, pc); | ||
114 | @@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc, | ||
115 | return NULL; | ||
116 | } | ||
117 | desc.phys_page1 = phys_pc & TARGET_PAGE_MASK; | ||
118 | - h = tb_hash_func(phys_pc, pc, flags, cf_mask, *cpu->trace_dstate); | ||
119 | + h = tb_hash_func(phys_pc, pc, flags, cflags, *cpu->trace_dstate); | ||
120 | return qht_lookup_custom(&tb_ctx.htable, &desc, h, tb_lookup_cmp); | ||
121 | } | ||
122 | |||
123 | @@ -XXX,XX +XXX,XX @@ static inline void tb_add_jump(TranslationBlock *tb, int n, | ||
124 | |||
125 | static inline TranslationBlock *tb_find(CPUState *cpu, | ||
126 | TranslationBlock *last_tb, | ||
127 | - int tb_exit, uint32_t cf_mask) | ||
128 | + int tb_exit, uint32_t cflags) | ||
129 | { | ||
130 | CPUArchState *env = (CPUArchState *)cpu->env_ptr; | ||
131 | TranslationBlock *tb; | ||
132 | @@ -XXX,XX +XXX,XX @@ static inline TranslationBlock *tb_find(CPUState *cpu, | ||
133 | |||
134 | cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags); | ||
135 | |||
136 | - tb = tb_lookup(cpu, pc, cs_base, flags, cf_mask); | ||
137 | + tb = tb_lookup(cpu, pc, cs_base, flags, cflags); | ||
138 | if (tb == NULL) { | ||
139 | mmap_lock(); | ||
140 | - tb = tb_gen_code(cpu, pc, cs_base, flags, cf_mask); | ||
141 | + tb = tb_gen_code(cpu, pc, cs_base, flags, cflags); | ||
142 | mmap_unlock(); | ||
143 | /* We add the TB in the virtual pc hash table for the fast lookup */ | ||
144 | qatomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb); | ||
145 | diff --git a/accel/tcg/tcg-runtime.c b/accel/tcg/tcg-runtime.c | ||
146 | index XXXXXXX..XXXXXXX 100644 | ||
147 | --- a/accel/tcg/tcg-runtime.c | ||
148 | +++ b/accel/tcg/tcg-runtime.c | ||
37 | @@ -XXX,XX +XXX,XX @@ | 149 | @@ -XXX,XX +XXX,XX @@ |
38 | #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10) | 150 | #include "exec/helper-proto.h" |
39 | 151 | #include "exec/cpu_ldst.h" | |
40 | /* Kick all RR vCPUs. */ | 152 | #include "exec/exec-all.h" |
41 | -void qemu_cpu_kick_rr_cpus(CPUState *unused); | 153 | -#include "exec/tb-lookup.h" |
42 | +void rr_kick_vcpu_thread(CPUState *unused); | 154 | #include "disas/disas.h" |
43 | 155 | #include "exec/log.h" | |
44 | /* start the round robin vcpu thread */ | 156 | #include "tcg/tcg.h" |
45 | void rr_start_vcpu_thread(CPUState *cpu); | 157 | +#include "exec/tb-lookup.h" |
46 | diff --git a/accel/tcg/tcg-cpus.h b/accel/tcg/tcg-cpus.h | 158 | |
47 | index XXXXXXX..XXXXXXX 100644 | 159 | /* 32-bit helpers */ |
48 | --- a/accel/tcg/tcg-cpus.h | 160 | |
49 | +++ b/accel/tcg/tcg-cpus.h | 161 | diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c |
50 | @@ -XXX,XX +XXX,XX @@ extern const CpusAccel tcg_cpus_mttcg; | 162 | index XXXXXXX..XXXXXXX 100644 |
51 | extern const CpusAccel tcg_cpus_icount; | 163 | --- a/accel/tcg/translate-all.c |
52 | extern const CpusAccel tcg_cpus_rr; | 164 | +++ b/accel/tcg/translate-all.c |
53 | 165 | @@ -XXX,XX +XXX,XX @@ static bool tb_cmp(const void *ap, const void *bp) | |
54 | -void qemu_tcg_destroy_vcpu(CPUState *cpu); | 166 | return a->pc == b->pc && |
55 | -int tcg_cpu_exec(CPUState *cpu); | 167 | a->cs_base == b->cs_base && |
56 | -void tcg_handle_interrupt(CPUState *cpu, int mask); | 168 | a->flags == b->flags && |
57 | +void tcg_cpus_destroy(CPUState *cpu); | 169 | - (tb_cflags(a) & CF_HASH_MASK) == (tb_cflags(b) & CF_HASH_MASK) && |
58 | +int tcg_cpus_exec(CPUState *cpu); | 170 | + (tb_cflags(a) & ~CF_INVALID) == (tb_cflags(b) & ~CF_INVALID) && |
59 | +void tcg_cpus_handle_interrupt(CPUState *cpu, int mask); | 171 | a->trace_vcpu_dstate == b->trace_vcpu_dstate && |
60 | 172 | a->page_addr[0] == b->page_addr[0] && | |
61 | #endif /* TCG_CPUS_H */ | 173 | a->page_addr[1] == b->page_addr[1]; |
62 | diff --git a/accel/tcg/tcg-cpus-icount.c b/accel/tcg/tcg-cpus-icount.c | 174 | @@ -XXX,XX +XXX,XX @@ static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list) |
63 | index XXXXXXX..XXXXXXX 100644 | 175 | PageDesc *p; |
64 | --- a/accel/tcg/tcg-cpus-icount.c | 176 | uint32_t h; |
65 | +++ b/accel/tcg/tcg-cpus-icount.c | 177 | tb_page_addr_t phys_pc; |
66 | @@ -XXX,XX +XXX,XX @@ | 178 | + uint32_t orig_cflags = tb_cflags(tb); |
67 | #include "tcg-cpus-icount.h" | 179 | |
68 | #include "tcg-cpus-rr.h" | 180 | assert_memory_lock(); |
69 | 181 | ||
70 | -static int64_t tcg_get_icount_limit(void) | 182 | @@ -XXX,XX +XXX,XX @@ static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list) |
71 | +static int64_t icount_get_limit(void) | 183 | |
72 | { | 184 | /* remove the TB from the hash list */ |
73 | int64_t deadline; | 185 | phys_pc = tb->page_addr[0] + (tb->pc & ~TARGET_PAGE_MASK); |
74 | 186 | - h = tb_hash_func(phys_pc, tb->pc, tb->flags, tb_cflags(tb) & CF_HASH_MASK, | |
75 | @@ -XXX,XX +XXX,XX @@ static int64_t tcg_get_icount_limit(void) | 187 | + h = tb_hash_func(phys_pc, tb->pc, tb->flags, orig_cflags, |
76 | } | 188 | tb->trace_vcpu_dstate); |
77 | } | 189 | if (!qht_remove(&tb_ctx.htable, tb, h)) { |
78 | 190 | return; | |
79 | -static void notify_aio_contexts(void) | 191 | @@ -XXX,XX +XXX,XX @@ tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc, |
80 | +static void icount_notify_aio_contexts(void) | 192 | uint32_t h; |
81 | { | 193 | |
82 | /* Wake up other AioContexts. */ | 194 | assert_memory_lock(); |
83 | qemu_clock_notify(QEMU_CLOCK_VIRTUAL); | 195 | + tcg_debug_assert(!(tb->cflags & CF_INVALID)); |
84 | qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL); | ||
85 | } | ||
86 | |||
87 | -void handle_icount_deadline(void) | ||
88 | +void icount_handle_deadline(void) | ||
89 | { | ||
90 | assert(qemu_in_vcpu_thread()); | ||
91 | int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL, | ||
92 | QEMU_TIMER_ATTR_ALL); | ||
93 | |||
94 | if (deadline == 0) { | ||
95 | - notify_aio_contexts(); | ||
96 | + icount_notify_aio_contexts(); | ||
97 | } | ||
98 | } | ||
99 | |||
100 | -void prepare_icount_for_run(CPUState *cpu) | ||
101 | +void icount_prepare_for_run(CPUState *cpu) | ||
102 | { | ||
103 | int insns_left; | ||
104 | 196 | ||
105 | /* | 197 | /* |
106 | - * These should always be cleared by process_icount_data after | 198 | * Add the TB to the page list, acquiring first the pages's locks. |
107 | + * These should always be cleared by icount_process_data after | 199 | @@ -XXX,XX +XXX,XX @@ tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc, |
108 | * each vCPU execution. However u16.high can be raised | 200 | } |
109 | - * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt | 201 | |
110 | + * asynchronously by cpu_exit/cpu_interrupt/tcg_cpus_handle_interrupt | 202 | /* add in the hash table */ |
111 | */ | 203 | - h = tb_hash_func(phys_pc, tb->pc, tb->flags, tb->cflags & CF_HASH_MASK, |
112 | g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0); | 204 | + h = tb_hash_func(phys_pc, tb->pc, tb->flags, tb->cflags, |
113 | g_assert(cpu->icount_extra == 0); | 205 | tb->trace_vcpu_dstate); |
114 | 206 | qht_insert(&tb_ctx.htable, tb, h, &existing_tb); | |
115 | - cpu->icount_budget = tcg_get_icount_limit(); | ||
116 | + cpu->icount_budget = icount_get_limit(); | ||
117 | insns_left = MIN(0xffff, cpu->icount_budget); | ||
118 | cpu_neg(cpu)->icount_decr.u16.low = insns_left; | ||
119 | cpu->icount_extra = cpu->icount_budget - insns_left; | ||
120 | @@ -XXX,XX +XXX,XX @@ void prepare_icount_for_run(CPUState *cpu) | ||
121 | replay_mutex_lock(); | ||
122 | |||
123 | if (cpu->icount_budget == 0 && replay_has_checkpoint()) { | ||
124 | - notify_aio_contexts(); | ||
125 | + icount_notify_aio_contexts(); | ||
126 | } | ||
127 | } | ||
128 | |||
129 | -void process_icount_data(CPUState *cpu) | ||
130 | +void icount_process_data(CPUState *cpu) | ||
131 | { | ||
132 | /* Account for executed instructions */ | ||
133 | icount_update(cpu); | ||
134 | @@ -XXX,XX +XXX,XX @@ static void icount_handle_interrupt(CPUState *cpu, int mask) | ||
135 | { | ||
136 | int old_mask = cpu->interrupt_request; | ||
137 | |||
138 | - tcg_handle_interrupt(cpu, mask); | ||
139 | + tcg_cpus_handle_interrupt(cpu, mask); | ||
140 | if (qemu_cpu_is_self(cpu) && | ||
141 | !cpu->can_do_io | ||
142 | && (mask & ~old_mask) != 0) { | ||
143 | @@ -XXX,XX +XXX,XX @@ static void icount_handle_interrupt(CPUState *cpu, int mask) | ||
144 | |||
145 | const CpusAccel tcg_cpus_icount = { | ||
146 | .create_vcpu_thread = rr_start_vcpu_thread, | ||
147 | - .kick_vcpu_thread = qemu_cpu_kick_rr_cpus, | ||
148 | + .kick_vcpu_thread = rr_kick_vcpu_thread, | ||
149 | |||
150 | .handle_interrupt = icount_handle_interrupt, | ||
151 | .get_virtual_clock = icount_get, | ||
152 | diff --git a/accel/tcg/tcg-cpus-mttcg.c b/accel/tcg/tcg-cpus-mttcg.c | ||
153 | index XXXXXXX..XXXXXXX 100644 | ||
154 | --- a/accel/tcg/tcg-cpus-mttcg.c | ||
155 | +++ b/accel/tcg/tcg-cpus-mttcg.c | ||
156 | @@ -XXX,XX +XXX,XX @@ | ||
157 | * current CPUState for a given thread. | ||
158 | */ | ||
159 | |||
160 | -static void *tcg_cpu_thread_fn(void *arg) | ||
161 | +static void *mttcg_cpu_thread_fn(void *arg) | ||
162 | { | ||
163 | CPUState *cpu = arg; | ||
164 | |||
165 | @@ -XXX,XX +XXX,XX @@ static void *tcg_cpu_thread_fn(void *arg) | ||
166 | if (cpu_can_run(cpu)) { | ||
167 | int r; | ||
168 | qemu_mutex_unlock_iothread(); | ||
169 | - r = tcg_cpu_exec(cpu); | ||
170 | + r = tcg_cpus_exec(cpu); | ||
171 | qemu_mutex_lock_iothread(); | ||
172 | switch (r) { | ||
173 | case EXCP_DEBUG: | ||
174 | @@ -XXX,XX +XXX,XX @@ static void *tcg_cpu_thread_fn(void *arg) | ||
175 | qemu_wait_io_event(cpu); | ||
176 | } while (!cpu->unplug || cpu_can_run(cpu)); | ||
177 | |||
178 | - qemu_tcg_destroy_vcpu(cpu); | ||
179 | + tcg_cpus_destroy(cpu); | ||
180 | qemu_mutex_unlock_iothread(); | ||
181 | rcu_unregister_thread(); | ||
182 | return NULL; | ||
183 | @@ -XXX,XX +XXX,XX @@ static void mttcg_start_vcpu_thread(CPUState *cpu) | ||
184 | snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG", | ||
185 | cpu->cpu_index); | ||
186 | |||
187 | - qemu_thread_create(cpu->thread, thread_name, tcg_cpu_thread_fn, | ||
188 | + qemu_thread_create(cpu->thread, thread_name, mttcg_cpu_thread_fn, | ||
189 | cpu, QEMU_THREAD_JOINABLE); | ||
190 | |||
191 | #ifdef _WIN32 | ||
192 | @@ -XXX,XX +XXX,XX @@ const CpusAccel tcg_cpus_mttcg = { | ||
193 | .create_vcpu_thread = mttcg_start_vcpu_thread, | ||
194 | .kick_vcpu_thread = mttcg_kick_vcpu_thread, | ||
195 | |||
196 | - .handle_interrupt = tcg_handle_interrupt, | ||
197 | + .handle_interrupt = tcg_cpus_handle_interrupt, | ||
198 | }; | ||
199 | diff --git a/accel/tcg/tcg-cpus-rr.c b/accel/tcg/tcg-cpus-rr.c | ||
200 | index XXXXXXX..XXXXXXX 100644 | ||
201 | --- a/accel/tcg/tcg-cpus-rr.c | ||
202 | +++ b/accel/tcg/tcg-cpus-rr.c | ||
203 | @@ -XXX,XX +XXX,XX @@ | ||
204 | #include "tcg-cpus-icount.h" | ||
205 | |||
206 | /* Kick all RR vCPUs */ | ||
207 | -void qemu_cpu_kick_rr_cpus(CPUState *unused) | ||
208 | +void rr_kick_vcpu_thread(CPUState *unused) | ||
209 | { | ||
210 | CPUState *cpu; | ||
211 | |||
212 | @@ -XXX,XX +XXX,XX @@ void qemu_cpu_kick_rr_cpus(CPUState *unused) | ||
213 | * idleness is complete. | ||
214 | */ | ||
215 | |||
216 | -static QEMUTimer *tcg_kick_vcpu_timer; | ||
217 | -static CPUState *tcg_current_rr_cpu; | ||
218 | +static QEMUTimer *rr_kick_vcpu_timer; | ||
219 | +static CPUState *rr_current_cpu; | ||
220 | |||
221 | #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10) | ||
222 | |||
223 | -static inline int64_t qemu_tcg_next_kick(void) | ||
224 | +static inline int64_t rr_next_kick_time(void) | ||
225 | { | ||
226 | return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD; | ||
227 | } | ||
228 | |||
229 | /* Kick the currently round-robin scheduled vCPU to next */ | ||
230 | -static void qemu_cpu_kick_rr_next_cpu(void) | ||
231 | +static void rr_kick_next_cpu(void) | ||
232 | { | ||
233 | CPUState *cpu; | ||
234 | do { | ||
235 | - cpu = qatomic_mb_read(&tcg_current_rr_cpu); | ||
236 | + cpu = qatomic_mb_read(&rr_current_cpu); | ||
237 | if (cpu) { | ||
238 | cpu_exit(cpu); | ||
239 | } | ||
240 | - } while (cpu != qatomic_mb_read(&tcg_current_rr_cpu)); | ||
241 | + } while (cpu != qatomic_mb_read(&rr_current_cpu)); | ||
242 | } | ||
243 | |||
244 | -static void kick_tcg_thread(void *opaque) | ||
245 | +static void rr_kick_thread(void *opaque) | ||
246 | { | ||
247 | - timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick()); | ||
248 | - qemu_cpu_kick_rr_next_cpu(); | ||
249 | + timer_mod(rr_kick_vcpu_timer, rr_next_kick_time()); | ||
250 | + rr_kick_next_cpu(); | ||
251 | } | ||
252 | |||
253 | -static void start_tcg_kick_timer(void) | ||
254 | +static void rr_start_kick_timer(void) | ||
255 | { | ||
256 | - if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) { | ||
257 | - tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, | ||
258 | - kick_tcg_thread, NULL); | ||
259 | + if (!rr_kick_vcpu_timer && CPU_NEXT(first_cpu)) { | ||
260 | + rr_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, | ||
261 | + rr_kick_thread, NULL); | ||
262 | } | ||
263 | - if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) { | ||
264 | - timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick()); | ||
265 | + if (rr_kick_vcpu_timer && !timer_pending(rr_kick_vcpu_timer)) { | ||
266 | + timer_mod(rr_kick_vcpu_timer, rr_next_kick_time()); | ||
267 | } | ||
268 | } | ||
269 | |||
270 | -static void stop_tcg_kick_timer(void) | ||
271 | +static void rr_stop_kick_timer(void) | ||
272 | { | ||
273 | - if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) { | ||
274 | - timer_del(tcg_kick_vcpu_timer); | ||
275 | + if (rr_kick_vcpu_timer && timer_pending(rr_kick_vcpu_timer)) { | ||
276 | + timer_del(rr_kick_vcpu_timer); | ||
277 | } | ||
278 | } | ||
279 | |||
280 | -static void qemu_tcg_rr_wait_io_event(void) | ||
281 | +static void rr_wait_io_event(void) | ||
282 | { | ||
283 | CPUState *cpu; | ||
284 | |||
285 | while (all_cpu_threads_idle()) { | ||
286 | - stop_tcg_kick_timer(); | ||
287 | + rr_stop_kick_timer(); | ||
288 | qemu_cond_wait_iothread(first_cpu->halt_cond); | ||
289 | } | ||
290 | |||
291 | - start_tcg_kick_timer(); | ||
292 | + rr_start_kick_timer(); | ||
293 | |||
294 | CPU_FOREACH(cpu) { | ||
295 | qemu_wait_io_event_common(cpu); | ||
296 | @@ -XXX,XX +XXX,XX @@ static void qemu_tcg_rr_wait_io_event(void) | ||
297 | * Destroy any remaining vCPUs which have been unplugged and have | ||
298 | * finished running | ||
299 | */ | ||
300 | -static void deal_with_unplugged_cpus(void) | ||
301 | +static void rr_deal_with_unplugged_cpus(void) | ||
302 | { | ||
303 | CPUState *cpu; | ||
304 | |||
305 | CPU_FOREACH(cpu) { | ||
306 | if (cpu->unplug && !cpu_can_run(cpu)) { | ||
307 | - qemu_tcg_destroy_vcpu(cpu); | ||
308 | + tcg_cpus_destroy(cpu); | ||
309 | break; | ||
310 | } | ||
311 | } | ||
312 | @@ -XXX,XX +XXX,XX @@ static void deal_with_unplugged_cpus(void) | ||
313 | * elsewhere. | ||
314 | */ | ||
315 | |||
316 | -static void *tcg_rr_cpu_thread_fn(void *arg) | ||
317 | +static void *rr_cpu_thread_fn(void *arg) | ||
318 | { | ||
319 | CPUState *cpu = arg; | ||
320 | |||
321 | @@ -XXX,XX +XXX,XX @@ static void *tcg_rr_cpu_thread_fn(void *arg) | ||
322 | } | ||
323 | } | ||
324 | |||
325 | - start_tcg_kick_timer(); | ||
326 | + rr_start_kick_timer(); | ||
327 | |||
328 | cpu = first_cpu; | ||
329 | |||
330 | @@ -XXX,XX +XXX,XX @@ static void *tcg_rr_cpu_thread_fn(void *arg) | ||
331 | * Run the timers here. This is much more efficient than | ||
332 | * waking up the I/O thread and waiting for completion. | ||
333 | */ | ||
334 | - handle_icount_deadline(); | ||
335 | + icount_handle_deadline(); | ||
336 | } | ||
337 | |||
338 | replay_mutex_unlock(); | ||
339 | @@ -XXX,XX +XXX,XX @@ static void *tcg_rr_cpu_thread_fn(void *arg) | ||
340 | |||
341 | while (cpu && cpu_work_list_empty(cpu) && !cpu->exit_request) { | ||
342 | |||
343 | - qatomic_mb_set(&tcg_current_rr_cpu, cpu); | ||
344 | + qatomic_mb_set(&rr_current_cpu, cpu); | ||
345 | current_cpu = cpu; | ||
346 | |||
347 | qemu_clock_enable(QEMU_CLOCK_VIRTUAL, | ||
348 | @@ -XXX,XX +XXX,XX @@ static void *tcg_rr_cpu_thread_fn(void *arg) | ||
349 | |||
350 | qemu_mutex_unlock_iothread(); | ||
351 | if (icount_enabled()) { | ||
352 | - prepare_icount_for_run(cpu); | ||
353 | + icount_prepare_for_run(cpu); | ||
354 | } | ||
355 | - r = tcg_cpu_exec(cpu); | ||
356 | + r = tcg_cpus_exec(cpu); | ||
357 | if (icount_enabled()) { | ||
358 | - process_icount_data(cpu); | ||
359 | + icount_process_data(cpu); | ||
360 | } | ||
361 | qemu_mutex_lock_iothread(); | ||
362 | |||
363 | @@ -XXX,XX +XXX,XX @@ static void *tcg_rr_cpu_thread_fn(void *arg) | ||
364 | } /* while (cpu && !cpu->exit_request).. */ | ||
365 | |||
366 | /* Does not need qatomic_mb_set because a spurious wakeup is okay. */ | ||
367 | - qatomic_set(&tcg_current_rr_cpu, NULL); | ||
368 | + qatomic_set(&rr_current_cpu, NULL); | ||
369 | |||
370 | if (cpu && cpu->exit_request) { | ||
371 | qatomic_mb_set(&cpu->exit_request, 0); | ||
372 | @@ -XXX,XX +XXX,XX @@ static void *tcg_rr_cpu_thread_fn(void *arg) | ||
373 | qemu_notify_event(); | ||
374 | } | ||
375 | |||
376 | - qemu_tcg_rr_wait_io_event(); | ||
377 | - deal_with_unplugged_cpus(); | ||
378 | + rr_wait_io_event(); | ||
379 | + rr_deal_with_unplugged_cpus(); | ||
380 | } | ||
381 | |||
382 | rcu_unregister_thread(); | ||
383 | @@ -XXX,XX +XXX,XX @@ void rr_start_vcpu_thread(CPUState *cpu) | ||
384 | /* share a single thread for all cpus with TCG */ | ||
385 | snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG"); | ||
386 | qemu_thread_create(cpu->thread, thread_name, | ||
387 | - tcg_rr_cpu_thread_fn, | ||
388 | + rr_cpu_thread_fn, | ||
389 | cpu, QEMU_THREAD_JOINABLE); | ||
390 | |||
391 | single_tcg_halt_cond = cpu->halt_cond; | ||
392 | @@ -XXX,XX +XXX,XX @@ void rr_start_vcpu_thread(CPUState *cpu) | ||
393 | |||
394 | const CpusAccel tcg_cpus_rr = { | ||
395 | .create_vcpu_thread = rr_start_vcpu_thread, | ||
396 | - .kick_vcpu_thread = qemu_cpu_kick_rr_cpus, | ||
397 | + .kick_vcpu_thread = rr_kick_vcpu_thread, | ||
398 | |||
399 | - .handle_interrupt = tcg_handle_interrupt, | ||
400 | + .handle_interrupt = tcg_cpus_handle_interrupt, | ||
401 | }; | ||
402 | diff --git a/accel/tcg/tcg-cpus.c b/accel/tcg/tcg-cpus.c | ||
403 | index XXXXXXX..XXXXXXX 100644 | ||
404 | --- a/accel/tcg/tcg-cpus.c | ||
405 | +++ b/accel/tcg/tcg-cpus.c | ||
406 | @@ -XXX,XX +XXX,XX @@ | ||
407 | |||
408 | /* common functionality among all TCG variants */ | ||
409 | |||
410 | -void qemu_tcg_destroy_vcpu(CPUState *cpu) | ||
411 | +void tcg_cpus_destroy(CPUState *cpu) | ||
412 | { | ||
413 | cpu_thread_signal_destroyed(cpu); | ||
414 | } | ||
415 | |||
416 | -int tcg_cpu_exec(CPUState *cpu) | ||
417 | +int tcg_cpus_exec(CPUState *cpu) | ||
418 | { | ||
419 | int ret; | ||
420 | #ifdef CONFIG_PROFILER | ||
421 | @@ -XXX,XX +XXX,XX @@ int tcg_cpu_exec(CPUState *cpu) | ||
422 | } | ||
423 | |||
424 | /* mask must never be zero, except for A20 change call */ | ||
425 | -void tcg_handle_interrupt(CPUState *cpu, int mask) | ||
426 | +void tcg_cpus_handle_interrupt(CPUState *cpu, int mask) | ||
427 | { | ||
428 | g_assert(qemu_mutex_iothread_locked()); | ||
429 | 207 | ||
430 | -- | 208 | -- |
431 | 2.25.1 | 209 | 2.25.1 |
432 | 210 | ||
433 | 211 |
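The right-hand hunks above drop CF_HASH_MASK, so every cflags bit except CF_INVALID now feeds both the hash and the final comparison. A minimal, self-contained sketch of the resulting lookup-key comparison (stand-in types, flag value, and names; the real code lives in tb_cmp() and passes the unmasked cflags to tb_hash_func() for the qht hash table):

    #include <stdint.h>
    #include <stdbool.h>

    #define CF_INVALID 0x00040000u            /* stand-in for the real flag value */

    struct tb_key {                           /* illustrative subset of TranslationBlock */
        uint64_t pc;
        uint64_t cs_base;
        uint32_t flags;
        uint32_t cflags;
        uint32_t trace_vcpu_dstate;
    };

    /* With CF_HASH_MASK gone, only CF_INVALID is ignored when comparing. */
    static bool tb_key_equal(const struct tb_key *a, const struct tb_key *b)
    {
        return a->pc == b->pc &&
               a->cs_base == b->cs_base &&
               a->flags == b->flags &&
               (a->cflags & ~CF_INVALID) == (b->cflags & ~CF_INVALID) &&
               a->trace_vcpu_dstate == b->trace_vcpu_dstate;
    }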
New patch | |||
---|---|---|---|
1 | From: Alex Bennée <alex.bennee@linaro.org> | ||
1 | 2 | ||
3 | Let's make sure all the flags we compare when looking up blocks are | ||
4 | together in the same place. | ||
5 | |||
6 | Signed-off-by: Alex Bennée <alex.bennee@linaro.org> | ||
7 | Message-Id: <20210224165811.11567-5-alex.bennee@linaro.org> | ||
8 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
9 | --- | ||
10 | include/exec/exec-all.h | 8 +++++--- | ||
11 | 1 file changed, 5 insertions(+), 3 deletions(-) | ||
12 | |||
13 | diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h | ||
14 | index XXXXXXX..XXXXXXX 100644 | ||
15 | --- a/include/exec/exec-all.h | ||
16 | +++ b/include/exec/exec-all.h | ||
17 | @@ -XXX,XX +XXX,XX @@ struct TranslationBlock { | ||
18 | target_ulong pc; /* simulated PC corresponding to this block (EIP + CS base) */ | ||
19 | target_ulong cs_base; /* CS base for this block */ | ||
20 | uint32_t flags; /* flags defining in which context the code was generated */ | ||
21 | - uint16_t size; /* size of target code for this block (1 <= | ||
22 | - size <= TARGET_PAGE_SIZE) */ | ||
23 | - uint16_t icount; | ||
24 | uint32_t cflags; /* compile flags */ | ||
25 | #define CF_COUNT_MASK 0x00007fff | ||
26 | #define CF_LAST_IO 0x00008000 /* Last insn may be an IO access. */ | ||
27 | @@ -XXX,XX +XXX,XX @@ struct TranslationBlock { | ||
28 | /* Per-vCPU dynamic tracing state used to generate this TB */ | ||
29 | uint32_t trace_vcpu_dstate; | ||
30 | |||
31 | + /* Above fields used for comparing */ | ||
32 | + uint16_t size; /* size of target code for this block (1 <= | ||
33 | + size <= TARGET_PAGE_SIZE) */ | ||
34 | + uint16_t icount; | ||
35 | + | ||
36 | struct tb_tc tc; | ||
37 | |||
38 | /* first and second physical page containing code. The lower bit | ||
39 | -- | ||
40 | 2.25.1 | ||
41 | |||
42 |
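To make the grouping in the patch above concrete, a rough sketch of the field order that results (types are simplified stand-ins, and only the fields visible in the hunk are shown): the lookup/comparison fields sit together, while size and icount, which play no part in the lookup, move after them.

    #include <stdint.h>

    typedef uint64_t target_ulong_sketch;     /* stand-in for target_ulong */

    struct TranslationBlockSketch {
        /* fields consulted when comparing blocks during lookup, kept together */
        target_ulong_sketch pc;
        target_ulong_sketch cs_base;
        uint32_t flags;
        uint32_t cflags;
        uint32_t trace_vcpu_dstate;

        /* not part of the lookup key */
        uint16_t size;                        /* 1 <= size <= TARGET_PAGE_SIZE */
        uint16_t icount;
    };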
1 | From: Claudio Fontana <cfontana@suse.de> | 1 | The primary motivation is to remove a dozen insns along |
---|---|---|---|
2 | 2 | the fast-path in tb_lookup. As a byproduct, this allows | |
3 | After the initial split into 3 TCG variants, we proceed to also | 3 | us to completely remove parallel_cpus. |
4 | split tcg_start_vcpu_thread. | 4 | |
5 | |||
6 | We actually split it in two this time, since the icount variant | 6 | --- |
7 | just uses the round-robin function. | 7 | accel/tcg/tcg-accel-ops.h | 1 + |
8 | |||
9 | Suggested-by: Richard Henderson <richard.henderson@linaro.org> | ||
10 | Signed-off-by: Claudio Fontana <cfontana@suse.de> | ||
11 | Message-Id: <20201015143217.29337-3-cfontana@suse.de> | ||
12 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | 5 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> |
13 | --- | 6 | --- |
14 | accel/tcg/tcg-cpus-mttcg.h | 21 -------------- | 7 | accel/tcg/tcg-accel-ops.h | 1 + |
15 | accel/tcg/tcg-cpus-rr.h | 3 +- | 8 | include/exec/exec-all.h | 7 +------ |
16 | accel/tcg/tcg-cpus.h | 1 - | 9 | include/hw/core/cpu.h | 2 ++ |
17 | accel/tcg/tcg-all.c | 5 ++++ | 10 | accel/tcg/cpu-exec.c | 3 --- |
18 | accel/tcg/tcg-cpus-icount.c | 2 +- | 11 | accel/tcg/tcg-accel-ops-mttcg.c | 3 +-- |
19 | accel/tcg/tcg-cpus-mttcg.c | 29 +++++++++++++++++-- | 12 | accel/tcg/tcg-accel-ops-rr.c | 2 +- |
20 | accel/tcg/tcg-cpus-rr.c | 39 +++++++++++++++++++++++-- | 13 | accel/tcg/tcg-accel-ops.c | 8 ++++++++ |
21 | accel/tcg/tcg-cpus.c | 58 ------------------------------------- | 14 | accel/tcg/translate-all.c | 4 ---- |
22 | 8 files changed, 71 insertions(+), 87 deletions(-) | 15 | linux-user/main.c | 1 + |
23 | delete mode 100644 accel/tcg/tcg-cpus-mttcg.h | 16 | linux-user/sh4/signal.c | 8 +++++--- |
24 | 17 | linux-user/syscall.c | 18 ++++++++++-------- | |
25 | diff --git a/accel/tcg/tcg-cpus-mttcg.h b/accel/tcg/tcg-cpus-mttcg.h | 18 | 11 files changed, 30 insertions(+), 27 deletions(-) |
26 | deleted file mode 100644 | 19 | |
27 | index XXXXXXX..XXXXXXX | 20 | diff --git a/accel/tcg/tcg-accel-ops.h b/accel/tcg/tcg-accel-ops.h |
28 | --- a/accel/tcg/tcg-cpus-mttcg.h | 21 | index XXXXXXX..XXXXXXX 100644 |
29 | +++ /dev/null | 22 | --- a/accel/tcg/tcg-accel-ops.h |
23 | +++ b/accel/tcg/tcg-accel-ops.h | ||
30 | @@ -XXX,XX +XXX,XX @@ | 24 | @@ -XXX,XX +XXX,XX @@ |
31 | -/* | 25 | void tcg_cpus_destroy(CPUState *cpu); |
32 | - * QEMU TCG Multi Threaded vCPUs implementation | 26 | int tcg_cpus_exec(CPUState *cpu); |
33 | - * | 27 | void tcg_handle_interrupt(CPUState *cpu, int mask); |
34 | - * Copyright 2020 SUSE LLC | 28 | +void tcg_cpu_init_cflags(CPUState *cpu, bool parallel); |
35 | - * | 29 | |
36 | - * This work is licensed under the terms of the GNU GPL, version 2 or later. | 30 | #endif /* TCG_CPUS_H */ |
37 | - * See the COPYING file in the top-level directory. | 31 | diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h |
38 | - */ | 32 | index XXXXXXX..XXXXXXX 100644 |
39 | - | 33 | --- a/include/exec/exec-all.h |
40 | -#ifndef TCG_CPUS_MTTCG_H | 34 | +++ b/include/exec/exec-all.h |
41 | -#define TCG_CPUS_MTTCG_H | 35 | @@ -XXX,XX +XXX,XX @@ struct TranslationBlock { |
42 | - | 36 | uintptr_t jmp_dest[2]; |
43 | -/* | 37 | }; |
44 | - * In the multi-threaded case each vCPU has its own thread. The TLS | 38 | |
45 | - * variable current_cpu can be used deep in the code to find the | 39 | -extern bool parallel_cpus; |
46 | - * current CPUState for a given thread. | 40 | - |
47 | - */ | 41 | /* Hide the qatomic_read to make code a little easier on the eyes */ |
48 | - | 42 | static inline uint32_t tb_cflags(const TranslationBlock *tb) |
49 | -void *tcg_cpu_thread_fn(void *arg); | 43 | { |
50 | - | 44 | @@ -XXX,XX +XXX,XX @@ static inline uint32_t tb_cflags(const TranslationBlock *tb) |
51 | -#endif /* TCG_CPUS_MTTCG_H */ | 45 | /* current cflags for hashing/comparison */ |
52 | diff --git a/accel/tcg/tcg-cpus-rr.h b/accel/tcg/tcg-cpus-rr.h | 46 | static inline uint32_t curr_cflags(CPUState *cpu) |
53 | index XXXXXXX..XXXXXXX 100644 | 47 | { |
54 | --- a/accel/tcg/tcg-cpus-rr.h | 48 | - uint32_t cflags = deposit32(0, CF_CLUSTER_SHIFT, 8, cpu->cluster_index); |
55 | +++ b/accel/tcg/tcg-cpus-rr.h | 49 | - cflags |= parallel_cpus ? CF_PARALLEL : 0; |
50 | - cflags |= icount_enabled() ? CF_USE_ICOUNT : 0; | ||
51 | - return cflags; | ||
52 | + return cpu->tcg_cflags; | ||
53 | } | ||
54 | |||
55 | /* TranslationBlock invalidate API */ | ||
56 | diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h | ||
57 | index XXXXXXX..XXXXXXX 100644 | ||
58 | --- a/include/hw/core/cpu.h | ||
59 | +++ b/include/hw/core/cpu.h | ||
60 | @@ -XXX,XX +XXX,XX @@ struct qemu_work_item; | ||
61 | * to a cluster this will be UNASSIGNED_CLUSTER_INDEX; otherwise it will | ||
62 | * be the same as the cluster-id property of the CPU object's TYPE_CPU_CLUSTER | ||
63 | * QOM parent. | ||
64 | + * @tcg_cflags: Pre-computed cflags for this cpu. | ||
65 | * @nr_cores: Number of cores within this CPU package. | ||
66 | * @nr_threads: Number of threads within this CPU. | ||
67 | * @running: #true if CPU is currently running (lockless). | ||
68 | @@ -XXX,XX +XXX,XX @@ struct CPUState { | ||
69 | /* TODO Move common fields from CPUArchState here. */ | ||
70 | int cpu_index; | ||
71 | int cluster_index; | ||
72 | + uint32_t tcg_cflags; | ||
73 | uint32_t halted; | ||
74 | uint32_t can_do_io; | ||
75 | int32_t exception_index; | ||
76 | diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c | ||
77 | index XXXXXXX..XXXXXXX 100644 | ||
78 | --- a/accel/tcg/cpu-exec.c | ||
79 | +++ b/accel/tcg/cpu-exec.c | ||
80 | @@ -XXX,XX +XXX,XX @@ void cpu_exec_step_atomic(CPUState *cpu) | ||
81 | mmap_unlock(); | ||
82 | } | ||
83 | |||
84 | - /* Since we got here, we know that parallel_cpus must be true. */ | ||
85 | - parallel_cpus = false; | ||
86 | cpu_exec_enter(cpu); | ||
87 | /* execute the generated code */ | ||
88 | trace_exec_tb(tb, pc); | ||
89 | @@ -XXX,XX +XXX,XX @@ void cpu_exec_step_atomic(CPUState *cpu) | ||
90 | * the execution. | ||
91 | */ | ||
92 | g_assert(cpu_in_exclusive_context(cpu)); | ||
93 | - parallel_cpus = true; | ||
94 | cpu->running = false; | ||
95 | end_exclusive(); | ||
96 | } | ||
97 | diff --git a/accel/tcg/tcg-accel-ops-mttcg.c b/accel/tcg/tcg-accel-ops-mttcg.c | ||
98 | index XXXXXXX..XXXXXXX 100644 | ||
99 | --- a/accel/tcg/tcg-accel-ops-mttcg.c | ||
100 | +++ b/accel/tcg/tcg-accel-ops-mttcg.c | ||
101 | @@ -XXX,XX +XXX,XX @@ void mttcg_start_vcpu_thread(CPUState *cpu) | ||
102 | char thread_name[VCPU_THREAD_NAME_SIZE]; | ||
103 | |||
104 | g_assert(tcg_enabled()); | ||
105 | - | ||
106 | - parallel_cpus = (current_machine->smp.max_cpus > 1); | ||
107 | + tcg_cpu_init_cflags(cpu, current_machine->smp.max_cpus > 1); | ||
108 | |||
109 | cpu->thread = g_malloc0(sizeof(QemuThread)); | ||
110 | cpu->halt_cond = g_malloc0(sizeof(QemuCond)); | ||
111 | diff --git a/accel/tcg/tcg-accel-ops-rr.c b/accel/tcg/tcg-accel-ops-rr.c | ||
112 | index XXXXXXX..XXXXXXX 100644 | ||
113 | --- a/accel/tcg/tcg-accel-ops-rr.c | ||
114 | +++ b/accel/tcg/tcg-accel-ops-rr.c | ||
115 | @@ -XXX,XX +XXX,XX @@ void rr_start_vcpu_thread(CPUState *cpu) | ||
116 | static QemuThread *single_tcg_cpu_thread; | ||
117 | |||
118 | g_assert(tcg_enabled()); | ||
119 | - parallel_cpus = false; | ||
120 | + tcg_cpu_init_cflags(cpu, false); | ||
121 | |||
122 | if (!single_tcg_cpu_thread) { | ||
123 | cpu->thread = g_malloc0(sizeof(QemuThread)); | ||
124 | diff --git a/accel/tcg/tcg-accel-ops.c b/accel/tcg/tcg-accel-ops.c | ||
125 | index XXXXXXX..XXXXXXX 100644 | ||
126 | --- a/accel/tcg/tcg-accel-ops.c | ||
127 | +++ b/accel/tcg/tcg-accel-ops.c | ||
56 | @@ -XXX,XX +XXX,XX @@ | 128 | @@ -XXX,XX +XXX,XX @@ |
57 | /* Kick all RR vCPUs. */ | 129 | |
58 | void qemu_cpu_kick_rr_cpus(CPUState *unused); | 130 | /* common functionality among all TCG variants */ |
59 | 131 | ||
60 | -void *tcg_rr_cpu_thread_fn(void *arg); | 132 | +void tcg_cpu_init_cflags(CPUState *cpu, bool parallel) |
61 | +/* start the round robin vcpu thread */ | ||
62 | +void rr_start_vcpu_thread(CPUState *cpu); | ||
63 | |||
64 | #endif /* TCG_CPUS_RR_H */ | ||
65 | diff --git a/accel/tcg/tcg-cpus.h b/accel/tcg/tcg-cpus.h | ||
66 | index XXXXXXX..XXXXXXX 100644 | ||
67 | --- a/accel/tcg/tcg-cpus.h | ||
68 | +++ b/accel/tcg/tcg-cpus.h | ||
69 | @@ -XXX,XX +XXX,XX @@ extern const CpusAccel tcg_cpus_mttcg; | ||
70 | extern const CpusAccel tcg_cpus_icount; | ||
71 | extern const CpusAccel tcg_cpus_rr; | ||
72 | |||
73 | -void tcg_start_vcpu_thread(CPUState *cpu); | ||
74 | void qemu_tcg_destroy_vcpu(CPUState *cpu); | ||
75 | int tcg_cpu_exec(CPUState *cpu); | ||
76 | void tcg_handle_interrupt(CPUState *cpu, int mask); | ||
77 | diff --git a/accel/tcg/tcg-all.c b/accel/tcg/tcg-all.c | ||
78 | index XXXXXXX..XXXXXXX 100644 | ||
79 | --- a/accel/tcg/tcg-all.c | ||
80 | +++ b/accel/tcg/tcg-all.c | ||
81 | @@ -XXX,XX +XXX,XX @@ static int tcg_init(MachineState *ms) | ||
82 | tcg_exec_init(s->tb_size * 1024 * 1024); | ||
83 | mttcg_enabled = s->mttcg_enabled; | ||
84 | |||
85 | + /* | ||
86 | + * Initialize TCG regions | ||
87 | + */ | ||
88 | + tcg_region_init(); | ||
89 | + | ||
90 | if (mttcg_enabled) { | ||
91 | cpus_register_accel(&tcg_cpus_mttcg); | ||
92 | } else if (icount_enabled()) { | ||
93 | diff --git a/accel/tcg/tcg-cpus-icount.c b/accel/tcg/tcg-cpus-icount.c | ||
94 | index XXXXXXX..XXXXXXX 100644 | ||
95 | --- a/accel/tcg/tcg-cpus-icount.c | ||
96 | +++ b/accel/tcg/tcg-cpus-icount.c | ||
97 | @@ -XXX,XX +XXX,XX @@ static void icount_handle_interrupt(CPUState *cpu, int mask) | ||
98 | } | ||
99 | |||
100 | const CpusAccel tcg_cpus_icount = { | ||
101 | - .create_vcpu_thread = tcg_start_vcpu_thread, | ||
102 | + .create_vcpu_thread = rr_start_vcpu_thread, | ||
103 | .kick_vcpu_thread = qemu_cpu_kick_rr_cpus, | ||
104 | |||
105 | .handle_interrupt = icount_handle_interrupt, | ||
106 | diff --git a/accel/tcg/tcg-cpus-mttcg.c b/accel/tcg/tcg-cpus-mttcg.c | ||
107 | index XXXXXXX..XXXXXXX 100644 | ||
108 | --- a/accel/tcg/tcg-cpus-mttcg.c | ||
109 | +++ b/accel/tcg/tcg-cpus-mttcg.c | ||
110 | @@ -XXX,XX +XXX,XX @@ | ||
111 | #include "hw/boards.h" | ||
112 | |||
113 | #include "tcg-cpus.h" | ||
114 | -#include "tcg-cpus-mttcg.h" | ||
115 | |||
116 | /* | ||
117 | * In the multi-threaded case each vCPU has its own thread. The TLS | ||
118 | @@ -XXX,XX +XXX,XX @@ | ||
119 | * current CPUState for a given thread. | ||
120 | */ | ||
121 | |||
122 | -void *tcg_cpu_thread_fn(void *arg) | ||
123 | +static void *tcg_cpu_thread_fn(void *arg) | ||
124 | { | ||
125 | CPUState *cpu = arg; | ||
126 | |||
127 | @@ -XXX,XX +XXX,XX @@ static void mttcg_kick_vcpu_thread(CPUState *cpu) | ||
128 | cpu_exit(cpu); | ||
129 | } | ||
130 | |||
131 | +static void mttcg_start_vcpu_thread(CPUState *cpu) | ||
132 | +{ | 133 | +{ |
133 | + char thread_name[VCPU_THREAD_NAME_SIZE]; | 134 | + uint32_t cflags = cpu->cluster_index << CF_CLUSTER_SHIFT; |
134 | + | 135 | + cflags |= parallel ? CF_PARALLEL : 0; |
135 | + g_assert(tcg_enabled()); | 136 | + cflags |= icount_enabled() ? CF_USE_ICOUNT : 0; |
136 | + | 137 | + cpu->tcg_cflags = cflags; |
137 | + parallel_cpus = (current_machine->smp.max_cpus > 1); | ||
138 | + | ||
139 | + cpu->thread = g_malloc0(sizeof(QemuThread)); | ||
140 | + cpu->halt_cond = g_malloc0(sizeof(QemuCond)); | ||
141 | + qemu_cond_init(cpu->halt_cond); | ||
142 | + | ||
143 | + /* create a thread per vCPU with TCG (MTTCG) */ | ||
144 | + snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG", | ||
145 | + cpu->cpu_index); | ||
146 | + | ||
147 | + qemu_thread_create(cpu->thread, thread_name, tcg_cpu_thread_fn, | ||
148 | + cpu, QEMU_THREAD_JOINABLE); | ||
149 | + | ||
150 | +#ifdef _WIN32 | ||
151 | + cpu->hThread = qemu_thread_get_handle(cpu->thread); | ||
152 | +#endif | ||
153 | +} | 138 | +} |
154 | + | 139 | + |
155 | const CpusAccel tcg_cpus_mttcg = { | 140 | void tcg_cpus_destroy(CPUState *cpu) |
156 | - .create_vcpu_thread = tcg_start_vcpu_thread, | 141 | { |
157 | + .create_vcpu_thread = mttcg_start_vcpu_thread, | 142 | cpu_thread_signal_destroyed(cpu); |
158 | .kick_vcpu_thread = mttcg_kick_vcpu_thread, | 143 | diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c |
159 | 144 | index XXXXXXX..XXXXXXX 100644 | |
160 | .handle_interrupt = tcg_handle_interrupt, | 145 | --- a/accel/tcg/translate-all.c |
161 | diff --git a/accel/tcg/tcg-cpus-rr.c b/accel/tcg/tcg-cpus-rr.c | 146 | +++ b/accel/tcg/translate-all.c |
162 | index XXXXXXX..XXXXXXX 100644 | 147 | @@ -XXX,XX +XXX,XX @@ static void *l1_map[V_L1_MAX_SIZE]; |
163 | --- a/accel/tcg/tcg-cpus-rr.c | 148 | TCGContext tcg_init_ctx; |
164 | +++ b/accel/tcg/tcg-cpus-rr.c | 149 | __thread TCGContext *tcg_ctx; |
165 | @@ -XXX,XX +XXX,XX @@ static void deal_with_unplugged_cpus(void) | 150 | TBContext tb_ctx; |
166 | * elsewhere. | 151 | -bool parallel_cpus; |
167 | */ | 152 | |
168 | 153 | static void page_table_config_init(void) | |
169 | -void *tcg_rr_cpu_thread_fn(void *arg) | 154 | { |
170 | +static void *tcg_rr_cpu_thread_fn(void *arg) | 155 | @@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu, |
171 | { | 156 | cflags = (cflags & ~CF_COUNT_MASK) | 1; |
172 | CPUState *cpu = arg; | 157 | } |
173 | 158 | ||
174 | @@ -XXX,XX +XXX,XX @@ void *tcg_rr_cpu_thread_fn(void *arg) | 159 | - cflags &= ~CF_CLUSTER_MASK; |
175 | return NULL; | 160 | - cflags |= cpu->cluster_index << CF_CLUSTER_SHIFT; |
161 | - | ||
162 | max_insns = cflags & CF_COUNT_MASK; | ||
163 | if (max_insns == 0) { | ||
164 | max_insns = CF_COUNT_MASK; | ||
165 | diff --git a/linux-user/main.c b/linux-user/main.c | ||
166 | index XXXXXXX..XXXXXXX 100644 | ||
167 | --- a/linux-user/main.c | ||
168 | +++ b/linux-user/main.c | ||
169 | @@ -XXX,XX +XXX,XX @@ CPUArchState *cpu_copy(CPUArchState *env) | ||
170 | /* Reset non arch specific state */ | ||
171 | cpu_reset(new_cpu); | ||
172 | |||
173 | + new_cpu->tcg_cflags = cpu->tcg_cflags; | ||
174 | memcpy(new_env, env, sizeof(CPUArchState)); | ||
175 | |||
176 | /* Clone all break/watchpoints. | ||
177 | diff --git a/linux-user/sh4/signal.c b/linux-user/sh4/signal.c | ||
178 | index XXXXXXX..XXXXXXX 100644 | ||
179 | --- a/linux-user/sh4/signal.c | ||
180 | +++ b/linux-user/sh4/signal.c | ||
181 | @@ -XXX,XX +XXX,XX @@ static abi_ulong get_sigframe(struct target_sigaction *ka, | ||
182 | return (sp - frame_size) & -8ul; | ||
176 | } | 183 | } |
177 | 184 | ||
178 | +void rr_start_vcpu_thread(CPUState *cpu) | 185 | -/* Notice when we're in the middle of a gUSA region and reset. |
179 | +{ | 186 | - Note that this will only occur for !parallel_cpus, as we will |
180 | + char thread_name[VCPU_THREAD_NAME_SIZE]; | 187 | - translate such sequences differently in a parallel context. */ |
181 | + static QemuCond *single_tcg_halt_cond; | 188 | +/* |
182 | + static QemuThread *single_tcg_cpu_thread; | 189 | + * Notice when we're in the middle of a gUSA region and reset. |
190 | + * Note that this will only occur when #CF_PARALLEL is unset, as we | ||
191 | + * will translate such sequences differently in a parallel context. | ||
192 | + */ | ||
193 | static void unwind_gusa(CPUSH4State *regs) | ||
194 | { | ||
195 | /* If the stack pointer is sufficiently negative, and we haven't | ||
196 | diff --git a/linux-user/syscall.c b/linux-user/syscall.c | ||
197 | index XXXXXXX..XXXXXXX 100644 | ||
198 | --- a/linux-user/syscall.c | ||
199 | +++ b/linux-user/syscall.c | ||
200 | @@ -XXX,XX +XXX,XX @@ static int do_fork(CPUArchState *env, unsigned int flags, abi_ulong newsp, | ||
201 | /* Grab a mutex so that thread setup appears atomic. */ | ||
202 | pthread_mutex_lock(&clone_lock); | ||
203 | |||
204 | + /* | ||
205 | + * If this is our first additional thread, we need to ensure we | ||
206 | + * generate code for parallel execution and flush old translations. | ||
207 | + * Do this now so that the copy gets CF_PARALLEL too. | ||
208 | + */ | ||
209 | + if (!(cpu->tcg_cflags & CF_PARALLEL)) { | ||
210 | + cpu->tcg_cflags |= CF_PARALLEL; | ||
211 | + tb_flush(cpu); | ||
212 | + } | ||
183 | + | 213 | + |
184 | + g_assert(tcg_enabled()); | 214 | /* we create a new CPU instance. */ |
185 | + parallel_cpus = false; | 215 | new_env = cpu_copy(env); |
186 | + | 216 | /* Init regs that differ from the parent. */ |
187 | + if (!single_tcg_cpu_thread) { | 217 | @@ -XXX,XX +XXX,XX @@ static int do_fork(CPUArchState *env, unsigned int flags, abi_ulong newsp, |
188 | + cpu->thread = g_malloc0(sizeof(QemuThread)); | 218 | sigprocmask(SIG_BLOCK, &sigmask, &info.sigmask); |
189 | + cpu->halt_cond = g_malloc0(sizeof(QemuCond)); | 219 | cpu->random_seed = qemu_guest_random_seed_thread_part1(); |
190 | + qemu_cond_init(cpu->halt_cond); | 220 | |
191 | + | 221 | - /* If this is our first additional thread, we need to ensure we |
192 | + /* share a single thread for all cpus with TCG */ | 222 | - * generate code for parallel execution and flush old translations. |
193 | + snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG"); | 223 | - */ |
194 | + qemu_thread_create(cpu->thread, thread_name, | 224 | - if (!parallel_cpus) { |
195 | + tcg_rr_cpu_thread_fn, | 225 | - parallel_cpus = true; |
196 | + cpu, QEMU_THREAD_JOINABLE); | 226 | - tb_flush(cpu); |
197 | + | ||
198 | + single_tcg_halt_cond = cpu->halt_cond; | ||
199 | + single_tcg_cpu_thread = cpu->thread; | ||
200 | +#ifdef _WIN32 | ||
201 | + cpu->hThread = qemu_thread_get_handle(cpu->thread); | ||
202 | +#endif | ||
203 | + } else { | ||
204 | + /* we share the thread */ | ||
205 | + cpu->thread = single_tcg_cpu_thread; | ||
206 | + cpu->halt_cond = single_tcg_halt_cond; | ||
207 | + cpu->thread_id = first_cpu->thread_id; | ||
208 | + cpu->can_do_io = 1; | ||
209 | + cpu->created = true; | ||
210 | + } | ||
211 | +} | ||
212 | + | ||
213 | const CpusAccel tcg_cpus_rr = { | ||
214 | - .create_vcpu_thread = tcg_start_vcpu_thread, | ||
215 | + .create_vcpu_thread = rr_start_vcpu_thread, | ||
216 | .kick_vcpu_thread = qemu_cpu_kick_rr_cpus, | ||
217 | |||
218 | .handle_interrupt = tcg_handle_interrupt, | ||
219 | diff --git a/accel/tcg/tcg-cpus.c b/accel/tcg/tcg-cpus.c | ||
220 | index XXXXXXX..XXXXXXX 100644 | ||
221 | --- a/accel/tcg/tcg-cpus.c | ||
222 | +++ b/accel/tcg/tcg-cpus.c | ||
223 | @@ -XXX,XX +XXX,XX @@ | ||
224 | #include "hw/boards.h" | ||
225 | |||
226 | #include "tcg-cpus.h" | ||
227 | -#include "tcg-cpus-mttcg.h" | ||
228 | -#include "tcg-cpus-rr.h" | ||
229 | |||
230 | /* common functionality among all TCG variants */ | ||
231 | |||
232 | -void tcg_start_vcpu_thread(CPUState *cpu) | ||
233 | -{ | ||
234 | - char thread_name[VCPU_THREAD_NAME_SIZE]; | ||
235 | - static QemuCond *single_tcg_halt_cond; | ||
236 | - static QemuThread *single_tcg_cpu_thread; | ||
237 | - static int tcg_region_inited; | ||
238 | - | ||
239 | - assert(tcg_enabled()); | ||
240 | - /* | ||
241 | - * Initialize TCG regions--once. Now is a good time, because: | ||
242 | - * (1) TCG's init context, prologue and target globals have been set up. | ||
243 | - * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the | ||
244 | - * -accel flag is processed, so the check doesn't work then). | ||
245 | - */ | ||
246 | - if (!tcg_region_inited) { | ||
247 | - tcg_region_inited = 1; | ||
248 | - tcg_region_init(); | ||
249 | - parallel_cpus = qemu_tcg_mttcg_enabled() && current_machine->smp.max_cpus > 1; | ||
250 | - } | ||
251 | - | ||
252 | - if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) { | ||
253 | - cpu->thread = g_malloc0(sizeof(QemuThread)); | ||
254 | - cpu->halt_cond = g_malloc0(sizeof(QemuCond)); | ||
255 | - qemu_cond_init(cpu->halt_cond); | ||
256 | - | ||
257 | - if (qemu_tcg_mttcg_enabled()) { | ||
258 | - /* create a thread per vCPU with TCG (MTTCG) */ | ||
259 | - snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG", | ||
260 | - cpu->cpu_index); | ||
261 | - | ||
262 | - qemu_thread_create(cpu->thread, thread_name, tcg_cpu_thread_fn, | ||
263 | - cpu, QEMU_THREAD_JOINABLE); | ||
264 | - | ||
265 | - } else { | ||
266 | - /* share a single thread for all cpus with TCG */ | ||
267 | - snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG"); | ||
268 | - qemu_thread_create(cpu->thread, thread_name, | ||
269 | - tcg_rr_cpu_thread_fn, | ||
270 | - cpu, QEMU_THREAD_JOINABLE); | ||
271 | - | ||
272 | - single_tcg_halt_cond = cpu->halt_cond; | ||
273 | - single_tcg_cpu_thread = cpu->thread; | ||
274 | - } | 227 | - } |
275 | -#ifdef _WIN32 | 228 | - |
276 | - cpu->hThread = qemu_thread_get_handle(cpu->thread); | 229 | ret = pthread_create(&info.thread, &attr, clone_func, &info); |
277 | -#endif | 230 | /* TODO: Free new CPU state if thread creation failed. */ |
278 | - } else { | 231 | |
279 | - /* For non-MTTCG cases we share the thread */ | ||
280 | - cpu->thread = single_tcg_cpu_thread; | ||
281 | - cpu->halt_cond = single_tcg_halt_cond; | ||
282 | - cpu->thread_id = first_cpu->thread_id; | ||
283 | - cpu->can_do_io = 1; | ||
284 | - cpu->created = true; | ||
285 | - } | ||
286 | -} | ||
287 | - | ||
288 | void qemu_tcg_destroy_vcpu(CPUState *cpu) | ||
289 | { | ||
290 | cpu_thread_signal_destroyed(cpu); | ||
291 | -- | 232 | -- |
292 | 2.25.1 | 233 | 2.25.1 |
293 | 234 | ||
294 | 235 |
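The tcg_cflags patch in the right-hand column above replaces a per-lookup computation with a value fixed at vCPU-thread creation. A self-contained sketch of the before/after (flag values, helper names, and the CPUState layout are simplified stand-ins for the real definitions):

    #include <stdint.h>
    #include <stdbool.h>

    #define CF_USE_ICOUNT    0x00020000u      /* stand-in flag values */
    #define CF_PARALLEL      0x00080000u
    #define CF_CLUSTER_SHIFT 24

    typedef struct CPUStateSketch {
        int cluster_index;
        uint32_t tcg_cflags;                  /* the new pre-computed field */
    } CPUStateSketch;

    static bool parallel_cpus_sketch;         /* stand-in for the removed global */
    static bool icount_enabled_sketch;

    /* Before: recomputed on every translation-block lookup. */
    static uint32_t curr_cflags_old(const CPUStateSketch *cpu)
    {
        uint32_t cflags = (uint32_t)cpu->cluster_index << CF_CLUSTER_SHIFT;
        cflags |= parallel_cpus_sketch ? CF_PARALLEL : 0;
        cflags |= icount_enabled_sketch ? CF_USE_ICOUNT : 0;
        return cflags;
    }

    /* After: computed once when the vCPU thread is created ... */
    static void tcg_cpu_init_cflags_sketch(CPUStateSketch *cpu, bool parallel)
    {
        uint32_t cflags = (uint32_t)cpu->cluster_index << CF_CLUSTER_SHIFT;
        cflags |= parallel ? CF_PARALLEL : 0;
        cflags |= icount_enabled_sketch ? CF_USE_ICOUNT : 0;
        cpu->tcg_cflags = cflags;
    }

    /* ... so the lookup fast path reduces to a single field load. */
    static uint32_t curr_cflags_new(const CPUStateSketch *cpu)
    {
        return cpu->tcg_cflags;
    }

In the linux-user clone path the patch achieves what flipping the old parallel_cpus global did: it sets CF_PARALLEL in cpu->tcg_cflags and flushes existing translations before the new thread is created, so the copied CPU inherits the parallel flags.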