Second try's the charm today, right?

r~


The following changes since commit 00b1faea41d283e931256aa78aa975a369ec3ae6:

  Merge tag 'pull-target-arm-20230123' of https://git.linaro.org/people/pmaydell/qemu-arm into staging (2023-01-23 13:40:28 +0000)

are available in the Git repository at:

  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20230123

for you to fetch changes up to 709bcd7da3f6b4655d910634a0d520fa1439df38:

  tcg/loongarch64: Reorg goto_tb implementation (2023-01-23 16:00:13 -1000)

----------------------------------------------------------------
common-user: Re-enable ppc32 host
tcg: Avoid recursion in tcg_gen_mulu2_i32
tcg: Mark tcg helpers noinline to avoid an issue with LTO
tcg/arm: Use register pair allocation for qemu_{ld,st}_i64
disas: Enable loongarch disassembler, and fixes
tcg/loongarch64: Improve move immediate
tcg/loongarch64: Improve add immediate
tcg/loongarch64: Improve setcond
tcg/loongarch64: Implement movcond
tcg/loongarch64: Use tcg_pcrel_diff in tcg_out_ldst
tcg/loongarch64: Reorg goto_tb implementation

----------------------------------------------------------------
Richard Henderson (14):
      tcg: Avoid recursion in tcg_gen_mulu2_i32
      tcg/arm: Use register pair allocation for qemu_{ld,st}_i64
      common-user/host/ppc: Implement safe-syscall.inc.S
      linux-user: Implment host/ppc/host-signal.h
      tcg: Mark tcg helpers noinline to avoid an issue with LTO
      target/loongarch: Enable the disassembler for host tcg
      target/loongarch: Disassemble jirl properly
      target/loongarch: Disassemble pcadd* addresses
      tcg/loongarch64: Update tcg-insn-defs.c.inc
      tcg/loongarch64: Introduce tcg_out_addi
      tcg/loongarch64: Improve setcond expansion
      tcg/loongarch64: Implement movcond
      tcg/loongarch64: Use tcg_pcrel_diff in tcg_out_ldst
      tcg/loongarch64: Reorg goto_tb implementation

Rui Wang (1):
      tcg/loongarch64: Optimize immediate loading

 include/exec/helper-proto.h                    |  32 ++-
 include/tcg/tcg.h                              |   7 -
 linux-user/include/host/ppc/host-signal.h      |  39 +++
 tcg/arm/tcg-target-con-set.h                   |   7 +-
 tcg/arm/tcg-target-con-str.h                   |   2 +
 tcg/loongarch64/tcg-target-con-set.h           |   5 +-
 tcg/loongarch64/tcg-target-con-str.h           |   2 +-
 tcg/loongarch64/tcg-target.h                   |  11 +-
 target/loongarch/insns.decode                  |   3 +-
 disas.c                                        |   2 +
 target/loongarch/disas.c                       |  39 ++-
 tcg/tcg-op.c                                   |   4 +-
 target/loongarch/insn_trans/trans_branch.c.inc |   2 +-
 tcg/arm/tcg-target.c.inc                       |  28 +-
 tcg/loongarch64/tcg-insn-defs.c.inc            |  10 +-
 tcg/loongarch64/tcg-target.c.inc               | 364 ++++++++++++++++---------
 common-user/host/ppc/safe-syscall.inc.S        | 107 ++++++++
 target/loongarch/meson.build                   |   3 +-
 18 files changed, 497 insertions(+), 170 deletions(-)
 create mode 100644 linux-user/include/host/ppc/host-signal.h
 create mode 100644 common-user/host/ppc/safe-syscall.inc.S
We have a test for one of TCG_TARGET_HAS_mulu2_i32 or
TCG_TARGET_HAS_muluh_i32 being defined, but the test
became non-functional when we changed to always define
all of these macros.

Replace this with a build-time test in tcg_gen_mulu2_i32.

Fixes: 25c4d9cc845 ("tcg: Always define all of the TCGOpcode enum members.")
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1435
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg.h | 7 -------
 tcg/tcg-op.c      | 4 +++-
 2 files changed, 3 insertions(+), 8 deletions(-)
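
For readers unfamiliar with the qemu_build_not_reached() idiom used below:
it relies on the compiler proving the offending branch dead, roughly like
referencing an undefined symbol from the unreachable arm. A minimal
standalone sketch of the pattern, with illustrative stand-in names rather
than QEMU's actual macros:

    #include <stdint.h>

    /* Stand-ins for the real TCG configuration macros. */
    #define HAS_MULU2  0
    #define HAS_MULUH  0
    #define REG_BITS   64

    /* Deliberately left undefined: any surviving reference breaks the build. */
    extern uint64_t build_time_unreachable(void);

    uint64_t gen_mulu2(uint32_t a, uint32_t b)
    {
        if (HAS_MULU2) {
            return 0;                        /* emit the native mulu2 opcode */
        } else if (HAS_MULUH) {
            return 0;                        /* emit mul + muluh */
        } else if (REG_BITS == 64) {
            return (uint64_t)a * b;          /* widen to 64 bits and multiply */
        } else {
            /* Either eliminated as dead code, or the build fails here. */
            return build_time_unreachable();
        }
    }

This converts the removed #error into a per-function requirement: only a
32-bit backend that really lacks both widening multiplies fails, and it
fails at build time rather than by recursing at code-generation time.
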
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -XXX,XX +XXX,XX @@ typedef uint64_t TCGRegSet;
 #define TCG_TARGET_HAS_rem_i64          0
 #endif

-/* For 32-bit targets, some sort of unsigned widening multiply is required. */
-#if TCG_TARGET_REG_BITS == 32 \
-    && !(defined(TCG_TARGET_HAS_mulu2_i32) \
-         || defined(TCG_TARGET_HAS_muluh_i32))
-# error "Missing unsigned widening multiply"
-#endif
-
 #if !defined(TCG_TARGET_HAS_v64) \
     && !defined(TCG_TARGET_HAS_v128) \
     && !defined(TCG_TARGET_HAS_v256)
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -XXX,XX +XXX,XX @@ void tcg_gen_mulu2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 arg1, TCGv_i32 arg2)
         tcg_gen_op3_i32(INDEX_op_muluh_i32, rh, arg1, arg2);
         tcg_gen_mov_i32(rl, t);
         tcg_temp_free_i32(t);
-    } else {
+    } else if (TCG_TARGET_REG_BITS == 64) {
         TCGv_i64 t0 = tcg_temp_new_i64();
         TCGv_i64 t1 = tcg_temp_new_i64();
         tcg_gen_extu_i32_i64(t0, arg1);
@@ -XXX,XX +XXX,XX @@ void tcg_gen_mulu2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 arg1, TCGv_i32 arg2)
         tcg_gen_extr_i64_i32(rl, rh, t0);
         tcg_temp_free_i64(t0);
         tcg_temp_free_i64(t1);
+    } else {
+        qemu_build_not_reached();
     }
 }
--
2.34.1
Although we still can't use ldrd and strd for all operations,
increase the chances by getting the register allocation correct.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/arm/tcg-target-con-set.h |  7 ++++---
 tcg/arm/tcg-target-con-str.h |  2 ++
 tcg/arm/tcg-target.c.inc     | 28 ++++++++++++++++----------
 3 files changed, 24 insertions(+), 13 deletions(-)
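
For reference, constraint register masks map bit i to register i, so the
new 0x5555 masks below ('e' and 'S') admit only even-numbered registers,
which is the pairing LDRD/STRD need for the data registers; the companion
'p' constraint then ties the high half to the following register, which is
exactly the invariant the new tcg_debug_assert()s double-check. A
throwaway sketch of what such a mask selects (not QEMU code):

    #include <stdio.h>

    int main(void)
    {
        unsigned mask = 0x5555;              /* 0b0101010101010101 */

        for (int reg = 0; reg < 16; reg++) {
            if (mask & (1u << reg)) {
                printf("r%d ", reg);         /* prints: r0 r2 r4 ... r14 */
            }
        }
        printf("\n");
        return 0;
    }
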
diff --git a/tcg/arm/tcg-target-con-set.h b/tcg/arm/tcg-target-con-set.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target-con-set.h
+++ b/tcg/arm/tcg-target-con-set.h
@@ -XXX,XX +XXX,XX @@ C_O0_I2(r, rIN)
 C_O0_I2(s, s)
 C_O0_I2(w, r)
 C_O0_I3(s, s, s)
+C_O0_I3(S, p, s)
 C_O0_I4(r, r, rI, rI)
-C_O0_I4(s, s, s, s)
+C_O0_I4(S, p, s, s)
 C_O1_I1(r, l)
 C_O1_I1(r, r)
 C_O1_I1(w, r)
@@ -XXX,XX +XXX,XX @@ C_O1_I2(w, w, wZ)
 C_O1_I3(w, w, w, w)
 C_O1_I4(r, r, r, rI, rI)
 C_O1_I4(r, r, rIN, rIK, 0)
-C_O2_I1(r, r, l)
-C_O2_I2(r, r, l, l)
+C_O2_I1(e, p, l)
+C_O2_I2(e, p, l, l)
 C_O2_I2(r, r, r, r)
 C_O2_I4(r, r, r, r, rIN, rIK)
 C_O2_I4(r, r, rI, rI, rIN, rIK)
diff --git a/tcg/arm/tcg-target-con-str.h b/tcg/arm/tcg-target-con-str.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target-con-str.h
+++ b/tcg/arm/tcg-target-con-str.h
@@ -XXX,XX +XXX,XX @@
  * Define constraint letters for register sets:
  * REGS(letter, register_mask)
  */
+REGS('e', ALL_GENERAL_REGS & 0x5555)  /* even regs */
 REGS('r', ALL_GENERAL_REGS)
 REGS('l', ALL_QLOAD_REGS)
 REGS('s', ALL_QSTORE_REGS)
+REGS('S', ALL_QSTORE_REGS & 0x5555)   /* even qstore */
 REGS('w', ALL_VECTOR_REGS)

 /*
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld_index(TCGContext *s, MemOp opc,
         tcg_out_ld32_r(s, COND_AL, datalo, addrlo, addend);
         break;
     case MO_UQ:
+        /* We used pair allocation for datalo, so already should be aligned. */
+        tcg_debug_assert((datalo & 1) == 0);
+        tcg_debug_assert(datahi == datalo + 1);
         /* LDRD requires alignment; double-check that. */
-        if (get_alignment_bits(opc) >= MO_64
-            && (datalo & 1) == 0 && datahi == datalo + 1) {
+        if (get_alignment_bits(opc) >= MO_64) {
             /*
              * Rm (the second address op) must not overlap Rt or Rt + 1.
              * Since datalo is aligned, we can simplify the test via alignment.
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld_direct(TCGContext *s, MemOp opc, TCGReg datalo,
         tcg_out_ld32_12(s, COND_AL, datalo, addrlo, 0);
         break;
     case MO_UQ:
+        /* We used pair allocation for datalo, so already should be aligned. */
+        tcg_debug_assert((datalo & 1) == 0);
+        tcg_debug_assert(datahi == datalo + 1);
         /* LDRD requires alignment; double-check that. */
-        if (get_alignment_bits(opc) >= MO_64
-            && (datalo & 1) == 0 && datahi == datalo + 1) {
+        if (get_alignment_bits(opc) >= MO_64) {
             tcg_out_ldrd_8(s, COND_AL, datalo, addrlo, 0);
         } else if (datalo == addrlo) {
             tcg_out_ld32_12(s, COND_AL, datahi, addrlo, 4);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st_index(TCGContext *s, ARMCond cond, MemOp opc,
         tcg_out_st32_r(s, cond, datalo, addrlo, addend);
         break;
     case MO_64:
+        /* We used pair allocation for datalo, so already should be aligned. */
+        tcg_debug_assert((datalo & 1) == 0);
+        tcg_debug_assert(datahi == datalo + 1);
         /* STRD requires alignment; double-check that. */
-        if (get_alignment_bits(opc) >= MO_64
-            && (datalo & 1) == 0 && datahi == datalo + 1) {
+        if (get_alignment_bits(opc) >= MO_64) {
             tcg_out_strd_r(s, cond, datalo, addrlo, addend);
         } else if (scratch_addend) {
             tcg_out_st32_rwb(s, cond, datalo, addend, addrlo);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st_direct(TCGContext *s, MemOp opc, TCGReg datalo,
         tcg_out_st32_12(s, COND_AL, datalo, addrlo, 0);
         break;
     case MO_64:
+        /* We used pair allocation for datalo, so already should be aligned. */
+        tcg_debug_assert((datalo & 1) == 0);
+        tcg_debug_assert(datahi == datalo + 1);
         /* STRD requires alignment; double-check that. */
-        if (get_alignment_bits(opc) >= MO_64
-            && (datalo & 1) == 0 && datahi == datalo + 1) {
+        if (get_alignment_bits(opc) >= MO_64) {
             tcg_out_strd_8(s, COND_AL, datalo, addrlo, 0);
         } else {
             tcg_out_st32_12(s, COND_AL, datalo, addrlo, 0);
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
     case INDEX_op_qemu_ld_i32:
         return TARGET_LONG_BITS == 32 ? C_O1_I1(r, l) : C_O1_I2(r, l, l);
     case INDEX_op_qemu_ld_i64:
-        return TARGET_LONG_BITS == 32 ? C_O2_I1(r, r, l) : C_O2_I2(r, r, l, l);
+        return TARGET_LONG_BITS == 32 ? C_O2_I1(e, p, l) : C_O2_I2(e, p, l, l);
     case INDEX_op_qemu_st_i32:
         return TARGET_LONG_BITS == 32 ? C_O0_I2(s, s) : C_O0_I3(s, s, s);
     case INDEX_op_qemu_st_i64:
-        return TARGET_LONG_BITS == 32 ? C_O0_I3(s, s, s) : C_O0_I4(s, s, s, s);
+        return TARGET_LONG_BITS == 32 ? C_O0_I3(S, p, s) : C_O0_I4(S, p, s, s);

     case INDEX_op_st_vec:
         return C_O0_I2(w, r);
--
2.34.1
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Daniel Henrique Barboza <danielhb413@gmail.com>
Message-Id: <20220729172141.1789105-2-richard.henderson@linaro.org>
---
 common-user/host/ppc/safe-syscall.inc.S | 107 ++++++++++++++++++++++++
 1 file changed, 107 insertions(+)
 create mode 100644 common-user/host/ppc/safe-syscall.inc.S
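
As an illustration of the contract described in the comments below, here is
a caller-side sketch; the wrapper and the literal syscall number are
hypothetical, not QEMU code:

    /* First argument: 'int *' to the signal_pending flag; second: the
     * syscall number; the rest: syscall arguments, all passed as 'long'. */
    long safe_syscall_base(int *pending, long nr, ...);

    static int signal_pending;

    long safe_read(int fd, void *buf, long count)
    {
        /* A signal landing inside the guarded window rewinds the PC to
         * safe_syscall_start, so the pending flag is re-checked and the
         * call reports QEMU_ERESTARTSYS instead of racing the handler. */
        return safe_syscall_base(&signal_pending, 3 /* __NR_read on ppc */,
                                 (long)fd, (long)buf, count);
    }
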
diff --git a/common-user/host/ppc/safe-syscall.inc.S b/common-user/host/ppc/safe-syscall.inc.S
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/common-user/host/ppc/safe-syscall.inc.S
@@ -XXX,XX +XXX,XX @@
+/*
+ * safe-syscall.inc.S : host-specific assembly fragment
+ * to handle signals occurring at the same time as system calls.
+ * This is intended to be included by common-user/safe-syscall.S
+ *
+ * Copyright (C) 2022 Linaro, Ltd.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+/*
+ * Standardize on the _CALL_FOO symbols used by GCC:
+ * Apple XCode does not define _CALL_DARWIN.
+ * Clang defines _CALL_ELF (64-bit) but not _CALL_SYSV (32-bit).
+ */
+#if !defined(_CALL_SYSV) && \
+    !defined(_CALL_DARWIN) && \
+    !defined(_CALL_AIX) && \
+    !defined(_CALL_ELF)
+# if defined(__APPLE__)
+#  define _CALL_DARWIN
+# elif defined(__ELF__) && TCG_TARGET_REG_BITS == 32
+#  define _CALL_SYSV
+# else
+#  error "Unknown ABI"
+# endif
+#endif
+
+#ifndef _CALL_SYSV
+# error "Unsupported ABI"
+#endif
+
+
+        .global safe_syscall_base
+        .global safe_syscall_start
+        .global safe_syscall_end
+        .type   safe_syscall_base, @function
+
+        .text
+
+        /*
+         * This is the entry point for making a system call. The calling
+         * convention here is that of a C varargs function with the
+         * first argument an 'int *' to the signal_pending flag, the
+         * second one the system call number (as a 'long'), and all further
+         * arguments being syscall arguments (also 'long').
+         */
+safe_syscall_base:
+        .cfi_startproc
+        stwu    1, -8(1)
+        .cfi_def_cfa_offset 8
+        stw     30, 4(1)
+        .cfi_offset 30, -4
+
+        /*
+         * We enter with r3 == &signal_pending
+         *               r4 == syscall number
+         *               r5 ... r10 == syscall arguments
+         *               and return the result in r3
+         * and the syscall instruction needs
+         *               r0 == syscall number
+         *               r3 ... r8 == syscall arguments
+         *               and returns the result in r3
+         * Shuffle everything around appropriately.
+         */
+        mr      30, 3   /* signal_pending */
+        mr      0, 4    /* syscall number */
+        mr      3, 5    /* syscall arguments */
+        mr      4, 6
+        mr      5, 7
+        mr      6, 8
+        mr      7, 9
+        mr      8, 10
+
+        /*
+         * This next sequence of code works in conjunction with the
+         * rewind_if_safe_syscall_function(). If a signal is taken
+         * and the interrupted PC is anywhere between 'safe_syscall_start'
+         * and 'safe_syscall_end' then we rewind it to 'safe_syscall_start'.
+         * The code sequence must therefore be able to cope with this, and
+         * the syscall instruction must be the final one in the sequence.
+         */
+safe_syscall_start:
+        /* if signal_pending is non-zero, don't do the call */
+        lwz     12, 0(30)
+        cmpwi   0, 12, 0
+        bne-    2f
+        sc
+safe_syscall_end:
+        /* code path when we did execute the syscall */
+        lwz     30, 4(1)        /* restore r30 */
+        addi    1, 1, 8         /* restore stack */
+        .cfi_restore 30
+        .cfi_def_cfa_offset 0
+        bnslr+                  /* return on success */
+        b       safe_syscall_set_errno_tail
+
+        /* code path when we didn't execute the syscall */
+2:      lwz     30, 4(1)
+        addi    1, 1, 8
+        addi    3, 0, QEMU_ERESTARTSYS
+        b       safe_syscall_set_errno_tail
+
+        .cfi_endproc
+
+        .size   safe_syscall_base, .-safe_syscall_base
--
2.34.1
This commit re-enables ppc32 as a linux-user host,
as existence of the directory is noted by configure.

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1097
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Daniel Henrique Barboza <danielhb413@gmail.com>
Message-Id: <20220729172141.1789105-3-richard.henderson@linaro.org>
---
 linux-user/include/host/ppc/host-signal.h | 39 +++++++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 linux-user/include/host/ppc/host-signal.h
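
A sketch of the consumer side: linux-user's common signal handler uses
these hooks roughly as below to implement the safe-syscall rewind (handler
shape simplified, symbol declarations illustrative). host_signal_write()
similarly lets it classify write faults: trap 0x400 is the
instruction-storage interrupt, and DSISR bit 0x02000000 marks the faulting
access as a store.

    #include <signal.h>
    #include <stdint.h>
    /* host_sigcontext, host_signal_pc(), host_signal_set_pc() come from
     * the new header below. */

    extern char safe_syscall_start[], safe_syscall_end[];

    static void host_sig_handler(int sig, siginfo_t *info, void *puc)
    {
        host_sigcontext *uc = puc;
        uintptr_t pc = host_signal_pc(uc);

        /* Interrupted inside the guarded syscall window?  Restart it. */
        if (pc > (uintptr_t)safe_syscall_start &&
            pc < (uintptr_t)safe_syscall_end) {
            host_signal_set_pc(uc, (uintptr_t)safe_syscall_start);
        }
    }
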
diff --git a/linux-user/include/host/ppc/host-signal.h b/linux-user/include/host/ppc/host-signal.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/linux-user/include/host/ppc/host-signal.h
@@ -XXX,XX +XXX,XX @@
+/*
+ * host-signal.h: signal info dependent on the host architecture
+ *
+ * Copyright (c) 2022 Linaro Ltd.
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef PPC_HOST_SIGNAL_H
+#define PPC_HOST_SIGNAL_H
+
+#include <asm/ptrace.h>
+
+/* The third argument to a SA_SIGINFO handler is ucontext_t. */
+typedef ucontext_t host_sigcontext;
+
+static inline uintptr_t host_signal_pc(host_sigcontext *uc)
+{
+    return uc->uc_mcontext.regs->nip;
+}
+
+static inline void host_signal_set_pc(host_sigcontext *uc, uintptr_t pc)
+{
+    uc->uc_mcontext.regs->nip = pc;
+}
+
+static inline void *host_signal_mask(host_sigcontext *uc)
+{
+    return &uc->uc_sigmask;
+}
+
+static inline bool host_signal_write(siginfo_t *info, host_sigcontext *uc)
+{
+    return uc->uc_mcontext.regs->trap != 0x400
+        && (uc->uc_mcontext.regs->dsisr & 0x02000000);
+}
+
+#endif
--
2.34.1
Marking helpers __attribute__((noinline)) prevents an issue
with GCC's ipa-split pass under --enable-lto.

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1454
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Tested-by: Idan Horowitz <idan.horowitz@gmail.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/helper-proto.h | 32 ++++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)
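
To make the change concrete, here is roughly what one helper prototype
expands to afterwards; the helper chosen is illustrative:

    #include <stdint.h>

    /*
     * DEF_HELPER_FLAGS_2(div_i32, TCG_CALL_NO_RWG_SE, s32, s32, s32)
     * now produces a declaration along the lines of:
     */
    int32_t helper_div_i32(int32_t arg1, int32_t arg2)
        __attribute__((noinline));

Because the attribute sits on a declaration in the same translation unit
as the helper's definition, it applies to the function body as well, which
is what keeps ipa-split from carving out the noreturn exception path.
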
diff --git a/include/exec/helper-proto.h b/include/exec/helper-proto.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/helper-proto.h
+++ b/include/exec/helper-proto.h
@@ -XXX,XX +XXX,XX @@

 #include "exec/helper-head.h"

+/*
+ * Work around an issue with --enable-lto, in which GCC's ipa-split pass
+ * decides to split out the noreturn code paths that raise an exception,
+ * taking the __builtin_return_address() along into the new function,
+ * where it no longer computes a value that returns to TCG generated code.
+ * Despite the name, the noinline attribute affects the splitter, so this
+ * prevents the optimization in question.  Given that helpers should not
+ * otherwise be called directly, this should not have any other visible effect.
+ *
+ * See https://gitlab.com/qemu-project/qemu/-/issues/1454
+ */
+#define DEF_HELPER_ATTR  __attribute__((noinline))
+
 #define DEF_HELPER_FLAGS_0(name, flags, ret) \
-dh_ctype(ret) HELPER(name) (void);
+dh_ctype(ret) HELPER(name) (void) DEF_HELPER_ATTR;

 #define DEF_HELPER_FLAGS_1(name, flags, ret, t1) \
-dh_ctype(ret) HELPER(name) (dh_ctype(t1));
+dh_ctype(ret) HELPER(name) (dh_ctype(t1)) DEF_HELPER_ATTR;

 #define DEF_HELPER_FLAGS_2(name, flags, ret, t1, t2) \
-dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2));
+dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2)) DEF_HELPER_ATTR;

 #define DEF_HELPER_FLAGS_3(name, flags, ret, t1, t2, t3) \
-dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3));
+dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), \
+                            dh_ctype(t3)) DEF_HELPER_ATTR;

 #define DEF_HELPER_FLAGS_4(name, flags, ret, t1, t2, t3, t4) \
 dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
-                            dh_ctype(t4));
+                            dh_ctype(t4)) DEF_HELPER_ATTR;

 #define DEF_HELPER_FLAGS_5(name, flags, ret, t1, t2, t3, t4, t5) \
 dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
-                            dh_ctype(t4), dh_ctype(t5));
+                            dh_ctype(t4), dh_ctype(t5)) DEF_HELPER_ATTR;

 #define DEF_HELPER_FLAGS_6(name, flags, ret, t1, t2, t3, t4, t5, t6) \
 dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
-                            dh_ctype(t4), dh_ctype(t5), dh_ctype(t6));
+                            dh_ctype(t4), dh_ctype(t5), \
+                            dh_ctype(t6)) DEF_HELPER_ATTR;

 #define DEF_HELPER_FLAGS_7(name, flags, ret, t1, t2, t3, t4, t5, t6, t7) \
 dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
                             dh_ctype(t4), dh_ctype(t5), dh_ctype(t6), \
-                            dh_ctype(t7));
+                            dh_ctype(t7)) DEF_HELPER_ATTR;

 #define IN_HELPER_PROTO

@@ -XXX,XX +XXX,XX @@ dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
 #undef DEF_HELPER_FLAGS_5
 #undef DEF_HELPER_FLAGS_6
 #undef DEF_HELPER_FLAGS_7
+#undef DEF_HELPER_ATTR

 #endif /* HELPER_PROTO_H */
--
2.34.1
diff view generated by jsdifflib
Reuse the decodetree based disassembler from
target/loongarch/ for tcg/loongarch64/.

The generation of decode-insns.c.inc into ./libcommon.fa.p/ could
eventually result in conflict, if any other host requires the same
trick, but this is good enough for now.

Reviewed-by: WANG Xuerui <git@xen0n.name>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 disas.c                      | 2 ++
 target/loongarch/meson.build | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/disas.c b/disas.c
index XXXXXXX..XXXXXXX 100644
--- a/disas.c
+++ b/disas.c
@@ -XXX,XX +XXX,XX @@ static void initialize_debug_host(CPUDebug *s)
     s->info.cap_insn_split = 6;
 #elif defined(__hppa__)
     s->info.print_insn = print_insn_hppa;
+#elif defined(__loongarch__)
+    s->info.print_insn = print_insn_loongarch;
 #endif
 }
 
diff --git a/target/loongarch/meson.build b/target/loongarch/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/target/loongarch/meson.build
+++ b/target/loongarch/meson.build
@@ -XXX,XX +XXX,XX @@ gen = decodetree.process('insns.decode')
 loongarch_ss = ss.source_set()
 loongarch_ss.add(files(
   'cpu.c',
-  'disas.c',
 ))
 loongarch_tcg_ss = ss.source_set()
 loongarch_tcg_ss.add(gen)
@@ -XXX,XX +XXX,XX @@ loongarch_softmmu_ss.add(files(
   'iocsr_helper.c',
 ))
 
+common_ss.add(when: 'CONFIG_LOONGARCH_DIS', if_true: [files('disas.c'), gen])
+
 loongarch_ss.add_all(when: 'CONFIG_TCG', if_true: [loongarch_tcg_ss])
 
 target_arch += {'loongarch': loongarch_ss}
--
2.34.1

While jirl shares the same instruction format as bne etc,
it is not assembled the same. In particular, rd is printed
first not second and the immediate is not pc-relative.

Decode into the arg_rr_i structure, which prints correctly.
This changes the "offs" member to "imm", so update translate
to match.

Reviewed-by: WANG Xuerui <git@xen0n.name>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/loongarch/insns.decode                  | 3 ++-
 target/loongarch/disas.c                       | 2 +-
 target/loongarch/insn_trans/trans_branch.c.inc | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/target/loongarch/insns.decode b/target/loongarch/insns.decode
index XXXXXXX..XXXXXXX 100644
--- a/target/loongarch/insns.decode
+++ b/target/loongarch/insns.decode
@@ -XXX,XX +XXX,XX @@
 @rr_ui12     .... ...... imm:12 rj:5 rd:5    &rr_i
 @rr_i14s2    .... .... .............. rj:5 rd:5    &rr_i imm=%i14s2
 @rr_i16      .... .. imm:s16 rj:5 rd:5    &rr_i
+@rr_i16s2    .... .. ................ rj:5 rd:5    &rr_i imm=%offs16
 @hint_r_i12  .... ...... imm:s12 rj:5 hint:5    &hint_r_i
 @rrr_sa2p1   .... ........ ... .. rk:5 rj:5 rd:5    &rrr_sa sa=%sa2p1
 @rrr_sa2     .... ........ ... sa:2 rk:5 rj:5 rd:5    &rrr_sa
@@ -XXX,XX +XXX,XX @@ beqz         0100 00 ................ ..... .....    @r_offs21
 bnez         0100 01 ................ ..... .....    @r_offs21
 bceqz        0100 10 ................ 00 ... .....   @c_offs21
 bcnez        0100 10 ................ 01 ... .....   @c_offs21
-jirl         0100 11 ................ ..... .....    @rr_offs16
+jirl         0100 11 ................ ..... .....    @rr_i16s2
 b            0101 00 ..........................      @offs26
 bl           0101 01 ..........................      @offs26
 beq          0101 10 ................ ..... .....    @rr_offs16
diff --git a/target/loongarch/disas.c b/target/loongarch/disas.c
index XXXXXXX..XXXXXXX 100644
--- a/target/loongarch/disas.c
+++ b/target/loongarch/disas.c
@@ -XXX,XX +XXX,XX @@ INSN(beqz,         r_offs)
 INSN(bnez,         r_offs)
 INSN(bceqz,        c_offs)
 INSN(bcnez,        c_offs)
-INSN(jirl,         rr_offs)
+INSN(jirl,         rr_i)
 INSN(b,            offs)
 INSN(bl,           offs)
 INSN(beq,          rr_offs)
diff --git a/target/loongarch/insn_trans/trans_branch.c.inc b/target/loongarch/insn_trans/trans_branch.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/loongarch/insn_trans/trans_branch.c.inc
+++ b/target/loongarch/insn_trans/trans_branch.c.inc
@@ -XXX,XX +XXX,XX @@ static bool trans_jirl(DisasContext *ctx, arg_jirl *a)
     TCGv dest = gpr_dst(ctx, a->rd, EXT_NONE);
     TCGv src1 = gpr_src(ctx, a->rj, EXT_NONE);
 
-    tcg_gen_addi_tl(cpu_pc, src1, a->offs);
+    tcg_gen_addi_tl(cpu_pc, src1, a->imm);
     tcg_gen_movi_tl(dest, ctx->base.pc_next + 4);
     gen_set_gpr(a->rd, dest, EXT_NONE);
     tcg_gen_lookup_and_goto_ptr();
--
2.34.1

Print both the raw field and the resolved pc-relative
address, as we do for branches.

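For reference, each pcadd* form resolves to a simple function of pc
and the immediate. A minimal standalone sketch, with made-up pc/imm
values (the real helpers are in the diff below):

    #include <stdio.h>
    #include <stdint.h>

    /* pcalau12i: add imm << 12 to pc, then clear the low 12 bits. */
    static uint64_t pcalau12i_target(uint64_t pc, int32_t imm)
    {
        return (pc + ((int64_t)imm << 12)) & ~(uint64_t)0xfff;
    }

    int main(void)
    {
        /* e.g. pc = 0x120004567, imm = 1 resolves to 0x120005000 */
        printf("0x%llx\n",
               (unsigned long long)pcalau12i_target(0x120004567, 1));
        return 0;
    }
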
Reviewed-by: WANG Xuerui <git@xen0n.name>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/loongarch/disas.c | 37 +++++++++++++++++++++++++++++++++----
 1 file changed, 33 insertions(+), 4 deletions(-)

diff --git a/target/loongarch/disas.c b/target/loongarch/disas.c
index XXXXXXX..XXXXXXX 100644
--- a/target/loongarch/disas.c
+++ b/target/loongarch/disas.c
@@ -XXX,XX +XXX,XX @@ INSN(fsel,         fffc)
 INSN(addu16i_d,    rr_i)
 INSN(lu12i_w,      r_i)
 INSN(lu32i_d,      r_i)
-INSN(pcaddi,       r_i)
-INSN(pcalau12i,    r_i)
-INSN(pcaddu12i,    r_i)
-INSN(pcaddu18i,    r_i)
 INSN(ll_w,         rr_i)
 INSN(sc_w,         rr_i)
 INSN(ll_d,         rr_i)
@@ -XXX,XX +XXX,XX @@ static bool trans_fcmp_cond_##suffix(DisasContext *ctx, \
 
 FCMP_INSN(s)
 FCMP_INSN(d)
+
+#define PCADD_INSN(name)                                        \
+static bool trans_##name(DisasContext *ctx, arg_##name *a)      \
+{                                                               \
+    output(ctx, #name, "r%d, %d # 0x%" PRIx64,                  \
+           a->rd, a->imm, gen_##name(ctx->pc, a->imm));         \
+    return true;                                                \
+}
+
+static uint64_t gen_pcaddi(uint64_t pc, int imm)
+{
+    return pc + (imm << 2);
+}
+
+static uint64_t gen_pcalau12i(uint64_t pc, int imm)
+{
+    return (pc + (imm << 12)) & ~0xfff;
+}
+
+static uint64_t gen_pcaddu12i(uint64_t pc, int imm)
+{
+    return pc + (imm << 12);
+}
+
+static uint64_t gen_pcaddu18i(uint64_t pc, int imm)
+{
+    return pc + ((uint64_t)(imm) << 18);
+}
+
+PCADD_INSN(pcaddi)
+PCADD_INSN(pcalau12i)
+PCADD_INSN(pcaddu12i)
+PCADD_INSN(pcaddu18i)
--
2.34.1

From: Rui Wang <wangrui@loongson.cn>

diff:
  Imm                 Before                  After
  0000000000000000    addi.w  rd, zero, 0     addi.w  rd, zero, 0
                      lu52i.d rd, zero, 0
  00000000fffff800    lu12i.w rd, -1          addi.w  rd, zero, -2048
                      ori     rd, rd, 2048    lu32i.d rd, 0
                      lu32i.d rd, 0

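A standalone sketch of the field checks behind the new logic, using
the second example from the table above (sextreg here is a local
stand-in mimicking QEMU's helper of the same name):

    #include <stdint.h>
    #include <assert.h>

    /* Signed extract of val[pos + len - 1 : pos]. */
    static int64_t sextreg(uint64_t val, int pos, int len)
    {
        return (int64_t)(val << (64 - pos - len)) >> (64 - len);
    }

    int main(void)
    {
        int64_t val = 0xfffff800;
        int64_t lo = sextreg(val, 0, 12);    /* -2048 */
        int64_t hi12 = sextreg(val, 12, 20); /* -1 */

        /* hi12 matching the sign-extension of lo means a single
         * addi.w already reproduces the low 32 bits, so only
         * lu32i.d rd, 0 is needed on top. */
        assert(lo == -2048);
        assert(hi12 == sextreg(lo, 12, 20));
        return 0;
    }
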
Reviewed-by: WANG Xuerui <git@xen0n.name>
Signed-off-by: Rui Wang <wangrui@loongson.cn>
Message-Id: <20221107144713.845550-1-wangrui@loongson.cn>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/loongarch64/tcg-target.c.inc | 35 +++++++++++---------------------
 1 file changed, 12 insertions(+), 23 deletions(-)

diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target.c.inc
+++ b/tcg/loongarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
     return true;
 }
 
-static bool imm_part_needs_loading(bool high_bits_are_ones,
-                                   tcg_target_long part)
-{
-    if (high_bits_are_ones) {
-        return part != -1;
-    } else {
-        return part != 0;
-    }
-}
-
 /* Loads a 32-bit immediate into rd, sign-extended. */
 static void tcg_out_movi_i32(TCGContext *s, TCGReg rd, int32_t val)
 {
@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi_i32(TCGContext *s, TCGReg rd, int32_t val)
     tcg_target_long hi12 = sextreg(val, 12, 20);
 
     /* Single-instruction cases. */
-    if (lo == val) {
-        /* val fits in simm12: addi.w rd, zero, val */
-        tcg_out_opc_addi_w(s, rd, TCG_REG_ZERO, val);
-        return;
-    }
-    if (0x800 <= val && val <= 0xfff) {
+    if (hi12 == 0) {
         /* val fits in uimm12: ori rd, zero, val */
         tcg_out_opc_ori(s, rd, TCG_REG_ZERO, val);
         return;
     }
+    if (hi12 == sextreg(lo, 12, 20)) {
+        /* val fits in simm12: addi.w rd, zero, val */
+        tcg_out_opc_addi_w(s, rd, TCG_REG_ZERO, val);
+        return;
+    }
 
     /* High bits must be set; load with lu12i.w + optional ori. */
     tcg_out_opc_lu12i_w(s, rd, hi12);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
 
     intptr_t pc_offset;
     tcg_target_long val_lo, val_hi, pc_hi, offset_hi;
-    tcg_target_long hi32, hi52;
-    bool rd_high_bits_are_ones;
+    tcg_target_long hi12, hi32, hi52;
 
     /* Value fits in signed i32. */
     if (type == TCG_TYPE_I32 || val == (int32_t)val) {
@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
         return;
     }
 
+    hi12 = sextreg(val, 12, 20);
     hi32 = sextreg(val, 32, 20);
     hi52 = sextreg(val, 52, 12);
 
     /* Single cu52i.d case. */
-    if (ctz64(val) >= 52) {
+    if ((hi52 != 0) && (ctz64(val) >= 52)) {
         tcg_out_opc_cu52i_d(s, rd, TCG_REG_ZERO, hi52);
         return;
     }
 
     /* Slow path.  Initialize the low 32 bits, then concat high bits. */
     tcg_out_movi_i32(s, rd, val);
-    rd_high_bits_are_ones = (int32_t)val < 0;
 
-    if (imm_part_needs_loading(rd_high_bits_are_ones, hi32)) {
+    /* Load hi32 and hi52 explicitly when they are unexpected values. */
+    if (hi32 != sextreg(hi12, 20, 20)) {
         tcg_out_opc_cu32i_d(s, rd, hi32);
-        rd_high_bits_are_ones = hi32 < 0;
     }
 
-    if (imm_part_needs_loading(rd_high_bits_are_ones, hi52)) {
+    if (hi52 != sextreg(hi32, 20, 12)) {
         tcg_out_opc_cu52i_d(s, rd, rd, hi52);
     }
 }
--
2.34.1

Regenerate with ADDU16I included:

    $ cd loongarch-opcodes/scripts/go
    $ go run ./genqemutcgdefs > $QEMU/tcg/loongarch64/tcg-insn-defs.c.inc

Reviewed-by: WANG Xuerui <git@xen0n.name>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/loongarch64/tcg-insn-defs.c.inc | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/tcg/loongarch64/tcg-insn-defs.c.inc b/tcg/loongarch64/tcg-insn-defs.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-insn-defs.c.inc
+++ b/tcg/loongarch64/tcg-insn-defs.c.inc
@@ -XXX,XX +XXX,XX @@
  *
  * This file is auto-generated by genqemutcgdefs from
  * https://github.com/loongson-community/loongarch-opcodes,
- * from commit 961f0c60f5b63e574d785995600c71ad5413fdc4.
+ * from commit 25ca7effe9d88101c1cf96c4005423643386d81f.
  * DO NOT EDIT.
  */
 
@@ -XXX,XX +XXX,XX @@ typedef enum {
     OPC_ANDI = 0x03400000,
     OPC_ORI = 0x03800000,
     OPC_XORI = 0x03c00000,
+    OPC_ADDU16I_D = 0x10000000,
     OPC_LU12I_W = 0x14000000,
     OPC_CU32I_D = 0x16000000,
     OPC_PCADDU2I = 0x18000000,
@@ -XXX,XX +XXX,XX @@ tcg_out_opc_xori(TCGContext *s, TCGReg d, TCGReg j, uint32_t uk12)
     tcg_out32(s, encode_djuk12_insn(OPC_XORI, d, j, uk12));
 }
 
+/* Emits the `addu16i.d d, j, sk16` instruction. */
+static void __attribute__((unused))
+tcg_out_opc_addu16i_d(TCGContext *s, TCGReg d, TCGReg j, int32_t sk16)
+{
+    tcg_out32(s, encode_djsk16_insn(OPC_ADDU16I_D, d, j, sk16));
+}
+
 /* Emits the `lu12i.w d, sj20` instruction. */
 static void __attribute__((unused))
 tcg_out_opc_lu12i_w(TCGContext *s, TCGReg d, int32_t sj20)
--
2.34.1

From: Claudio Fontana <cfontana@suse.de>

after the initial split into 3 tcg variants, we proceed to also
split tcg_start_vcpu_thread.

We actually split it in 2 this time, since the icount variant
just uses the round robin function.

Suggested-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Claudio Fontana <cfontana@suse.de>
Message-Id: <20201015143217.29337-3-cfontana@suse.de>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/tcg-cpus-mttcg.h  | 21 --------------
 accel/tcg/tcg-cpus-rr.h     |  3 +-
 accel/tcg/tcg-cpus.h        |  1 -
 accel/tcg/tcg-all.c         |  5 ++++
 accel/tcg/tcg-cpus-icount.c |  2 +-
 accel/tcg/tcg-cpus-mttcg.c  | 29 +++++++++++++++++--
 accel/tcg/tcg-cpus-rr.c     | 39 +++++++++++++++++++++++--
 accel/tcg/tcg-cpus.c        | 58 -------------------------------------
 8 files changed, 71 insertions(+), 87 deletions(-)
 delete mode 100644 accel/tcg/tcg-cpus-mttcg.h

diff --git a/accel/tcg/tcg-cpus-mttcg.h b/accel/tcg/tcg-cpus-mttcg.h
deleted file mode 100644
index XXXXXXX..XXXXXXX
--- a/accel/tcg/tcg-cpus-mttcg.h
+++ /dev/null
@@ -XXX,XX +XXX,XX @@
-/*
- * QEMU TCG Multi Threaded vCPUs implementation
- *
- * Copyright 2020 SUSE LLC
- *
- * This work is licensed under the terms of the GNU GPL, version 2 or later.
- * See the COPYING file in the top-level directory.
- */
-
-#ifndef TCG_CPUS_MTTCG_H
-#define TCG_CPUS_MTTCG_H
-
-/*
- * In the multi-threaded case each vCPU has its own thread. The TLS
- * variable current_cpu can be used deep in the code to find the
- * current CPUState for a given thread.
- */
-
-void *tcg_cpu_thread_fn(void *arg);
-
-#endif /* TCG_CPUS_MTTCG_H */
diff --git a/accel/tcg/tcg-cpus-rr.h b/accel/tcg/tcg-cpus-rr.h
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-cpus-rr.h
+++ b/accel/tcg/tcg-cpus-rr.h
@@ -XXX,XX +XXX,XX @@
 /* Kick all RR vCPUs. */
 void qemu_cpu_kick_rr_cpus(CPUState *unused);
 
-void *tcg_rr_cpu_thread_fn(void *arg);
+/* start the round robin vcpu thread */
+void rr_start_vcpu_thread(CPUState *cpu);
 
 #endif /* TCG_CPUS_RR_H */
diff --git a/accel/tcg/tcg-cpus.h b/accel/tcg/tcg-cpus.h
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-cpus.h
+++ b/accel/tcg/tcg-cpus.h
@@ -XXX,XX +XXX,XX @@ extern const CpusAccel tcg_cpus_mttcg;
 extern const CpusAccel tcg_cpus_icount;
 extern const CpusAccel tcg_cpus_rr;
 
-void tcg_start_vcpu_thread(CPUState *cpu);
 void qemu_tcg_destroy_vcpu(CPUState *cpu);
 int tcg_cpu_exec(CPUState *cpu);
 void tcg_handle_interrupt(CPUState *cpu, int mask);
diff --git a/accel/tcg/tcg-all.c b/accel/tcg/tcg-all.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-all.c
+++ b/accel/tcg/tcg-all.c
@@ -XXX,XX +XXX,XX @@ static int tcg_init(MachineState *ms)
     tcg_exec_init(s->tb_size * 1024 * 1024);
     mttcg_enabled = s->mttcg_enabled;
 
+    /*
+     * Initialize TCG regions
+     */
+    tcg_region_init();
+
     if (mttcg_enabled) {
         cpus_register_accel(&tcg_cpus_mttcg);
     } else if (icount_enabled()) {
diff --git a/accel/tcg/tcg-cpus-icount.c b/accel/tcg/tcg-cpus-icount.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-cpus-icount.c
+++ b/accel/tcg/tcg-cpus-icount.c
@@ -XXX,XX +XXX,XX @@ static void icount_handle_interrupt(CPUState *cpu, int mask)
 }
 
 const CpusAccel tcg_cpus_icount = {
-    .create_vcpu_thread = tcg_start_vcpu_thread,
+    .create_vcpu_thread = rr_start_vcpu_thread,
     .kick_vcpu_thread = qemu_cpu_kick_rr_cpus,
 
     .handle_interrupt = icount_handle_interrupt,
diff --git a/accel/tcg/tcg-cpus-mttcg.c b/accel/tcg/tcg-cpus-mttcg.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-cpus-mttcg.c
+++ b/accel/tcg/tcg-cpus-mttcg.c
@@ -XXX,XX +XXX,XX @@
 #include "hw/boards.h"
 
 #include "tcg-cpus.h"
-#include "tcg-cpus-mttcg.h"
 
 /*
  * In the multi-threaded case each vCPU has its own thread. The TLS
@@ -XXX,XX +XXX,XX @@
  * current CPUState for a given thread.
  */
 
-void *tcg_cpu_thread_fn(void *arg)
+static void *tcg_cpu_thread_fn(void *arg)
 {
     CPUState *cpu = arg;
 
@@ -XXX,XX +XXX,XX @@ static void mttcg_kick_vcpu_thread(CPUState *cpu)
     cpu_exit(cpu);
 }
 
+static void mttcg_start_vcpu_thread(CPUState *cpu)
+{
+    char thread_name[VCPU_THREAD_NAME_SIZE];
+
+    g_assert(tcg_enabled());
+
+    parallel_cpus = (current_machine->smp.max_cpus > 1);
+
+    cpu->thread = g_malloc0(sizeof(QemuThread));
+    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
+    qemu_cond_init(cpu->halt_cond);
+
+    /* create a thread per vCPU with TCG (MTTCG) */
+    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
+             cpu->cpu_index);
+
+    qemu_thread_create(cpu->thread, thread_name, tcg_cpu_thread_fn,
+                       cpu, QEMU_THREAD_JOINABLE);
+
+#ifdef _WIN32
+    cpu->hThread = qemu_thread_get_handle(cpu->thread);
+#endif
+}
+
 const CpusAccel tcg_cpus_mttcg = {
-    .create_vcpu_thread = tcg_start_vcpu_thread,
+    .create_vcpu_thread = mttcg_start_vcpu_thread,
     .kick_vcpu_thread = mttcg_kick_vcpu_thread,
 
     .handle_interrupt = tcg_handle_interrupt,
diff --git a/accel/tcg/tcg-cpus-rr.c b/accel/tcg/tcg-cpus-rr.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-cpus-rr.c
+++ b/accel/tcg/tcg-cpus-rr.c
@@ -XXX,XX +XXX,XX @@ static void deal_with_unplugged_cpus(void)
  * elsewhere.
  */
 
-void *tcg_rr_cpu_thread_fn(void *arg)
+static void *tcg_rr_cpu_thread_fn(void *arg)
 {
     CPUState *cpu = arg;
 
@@ -XXX,XX +XXX,XX @@ void *tcg_rr_cpu_thread_fn(void *arg)
     return NULL;
 }
 
+void rr_start_vcpu_thread(CPUState *cpu)
+{
+    char thread_name[VCPU_THREAD_NAME_SIZE];
+    static QemuCond *single_tcg_halt_cond;
+    static QemuThread *single_tcg_cpu_thread;
+
+    g_assert(tcg_enabled());
+    parallel_cpus = false;
+
+    if (!single_tcg_cpu_thread) {
+        cpu->thread = g_malloc0(sizeof(QemuThread));
+        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
+        qemu_cond_init(cpu->halt_cond);
+
+        /* share a single thread for all cpus with TCG */
+        snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
+        qemu_thread_create(cpu->thread, thread_name,
+                           tcg_rr_cpu_thread_fn,
+                           cpu, QEMU_THREAD_JOINABLE);
+
+        single_tcg_halt_cond = cpu->halt_cond;
+        single_tcg_cpu_thread = cpu->thread;
+#ifdef _WIN32
+        cpu->hThread = qemu_thread_get_handle(cpu->thread);
+#endif
+    } else {
+        /* we share the thread */
+        cpu->thread = single_tcg_cpu_thread;
+        cpu->halt_cond = single_tcg_halt_cond;
+        cpu->thread_id = first_cpu->thread_id;
+        cpu->can_do_io = 1;
+        cpu->created = true;
+    }
+}
+
 const CpusAccel tcg_cpus_rr = {
-    .create_vcpu_thread = tcg_start_vcpu_thread,
+    .create_vcpu_thread = rr_start_vcpu_thread,
     .kick_vcpu_thread = qemu_cpu_kick_rr_cpus,
 
     .handle_interrupt = tcg_handle_interrupt,
diff --git a/accel/tcg/tcg-cpus.c b/accel/tcg/tcg-cpus.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-cpus.c
+++ b/accel/tcg/tcg-cpus.c
@@ -XXX,XX +XXX,XX @@
 #include "hw/boards.h"
 
 #include "tcg-cpus.h"
-#include "tcg-cpus-mttcg.h"
-#include "tcg-cpus-rr.h"
 
 /* common functionality among all TCG variants */
 
-void tcg_start_vcpu_thread(CPUState *cpu)
-{
-    char thread_name[VCPU_THREAD_NAME_SIZE];
-    static QemuCond *single_tcg_halt_cond;
-    static QemuThread *single_tcg_cpu_thread;
-    static int tcg_region_inited;
-
-    assert(tcg_enabled());
-    /*
-     * Initialize TCG regions--once. Now is a good time, because:
-     * (1) TCG's init context, prologue and target globals have been set up.
-     * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
-     *     -accel flag is processed, so the check doesn't work then).
-     */
-    if (!tcg_region_inited) {
-        tcg_region_inited = 1;
-        tcg_region_init();
-        parallel_cpus = qemu_tcg_mttcg_enabled() && current_machine->smp.max_cpus > 1;
-    }
-
-    if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
-        cpu->thread = g_malloc0(sizeof(QemuThread));
-        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
-        qemu_cond_init(cpu->halt_cond);
-
-        if (qemu_tcg_mttcg_enabled()) {
-            /* create a thread per vCPU with TCG (MTTCG) */
-            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
-                     cpu->cpu_index);
-
-            qemu_thread_create(cpu->thread, thread_name, tcg_cpu_thread_fn,
-                               cpu, QEMU_THREAD_JOINABLE);
-
-        } else {
-            /* share a single thread for all cpus with TCG */
-            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
-            qemu_thread_create(cpu->thread, thread_name,
-                               tcg_rr_cpu_thread_fn,
-                               cpu, QEMU_THREAD_JOINABLE);
-
-            single_tcg_halt_cond = cpu->halt_cond;
-            single_tcg_cpu_thread = cpu->thread;
-        }
-#ifdef _WIN32
-        cpu->hThread = qemu_thread_get_handle(cpu->thread);
-#endif
-    } else {
-        /* For non-MTTCG cases we share the thread */
-        cpu->thread = single_tcg_cpu_thread;
-        cpu->halt_cond = single_tcg_halt_cond;
-        cpu->thread_id = first_cpu->thread_id;
-        cpu->can_do_io = 1;
-        cpu->created = true;
-    }
-}
-
 void qemu_tcg_destroy_vcpu(CPUState *cpu)
 {
     cpu_thread_signal_destroyed(cpu);
--
2.25.1

Adjust the constraints to allow any int32_t for immediate
addition.  Split immediate adds into addu16i + addi, which
covers quite a lot of the immediate space.  For the hole in
the middle, load the constant into TMP0 instead.

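A minimal standalone sketch of the decomposition tried first, with
illustrative immediates (sextreg again mimics QEMU's helper; the
second value lands in the hole described in the diff below and falls
back to the constant-load path):

    #include <stdio.h>
    #include <stdint.h>

    static int64_t sextreg(uint64_t val, int pos, int len)
    {
        return (int64_t)(val << (64 - pos - len)) >> (64 - len);
    }

    int main(void)
    {
        int64_t imms[] = { 0x12340678, 0x12345678 };

        for (int i = 0; i < 2; i++) {
            int64_t imm = imms[i];
            int64_t lo12 = sextreg(imm, 0, 12);
            int64_t hi16 = sextreg(imm - lo12, 16, 16);

            if (imm == (hi16 << 16) + lo12) {
                printf("%#lx: addu16i.d + addi\n", (long)imm);
            } else {
                /* bits 12..15 are set: load into TMP0 and add */
                printf("%#lx: movi + add\n", (long)imm);
            }
        }
        return 0;
    }
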
Reviewed-by: WANG Xuerui <git@xen0n.name>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/loongarch64/tcg-target-con-set.h |  4 +-
 tcg/loongarch64/tcg-target-con-str.h |  2 +-
 tcg/loongarch64/tcg-target.c.inc     | 57 ++++++++++++++++++++++++----
 3 files changed, 53 insertions(+), 10 deletions(-)

diff --git a/tcg/loongarch64/tcg-target-con-set.h b/tcg/loongarch64/tcg-target-con-set.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target-con-set.h
+++ b/tcg/loongarch64/tcg-target-con-set.h
@@ -XXX,XX +XXX,XX @@ C_O1_I1(r, L)
 C_O1_I2(r, r, rC)
 C_O1_I2(r, r, ri)
 C_O1_I2(r, r, rI)
+C_O1_I2(r, r, rJ)
 C_O1_I2(r, r, rU)
 C_O1_I2(r, r, rW)
 C_O1_I2(r, r, rZ)
 C_O1_I2(r, 0, rZ)
-C_O1_I2(r, rZ, rN)
+C_O1_I2(r, rZ, ri)
+C_O1_I2(r, rZ, rJ)
 C_O1_I2(r, rZ, rZ)
diff --git a/tcg/loongarch64/tcg-target-con-str.h b/tcg/loongarch64/tcg-target-con-str.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target-con-str.h
+++ b/tcg/loongarch64/tcg-target-con-str.h
@@ -XXX,XX +XXX,XX @@ REGS('L', ALL_GENERAL_REGS & ~SOFTMMU_RESERVE_REGS)
 * CONST(letter, TCG_CT_CONST_* bit set)
 */
 CONST('I', TCG_CT_CONST_S12)
-CONST('N', TCG_CT_CONST_N12)
+CONST('J', TCG_CT_CONST_S32)
 CONST('U', TCG_CT_CONST_U12)
 CONST('Z', TCG_CT_CONST_ZERO)
 CONST('C', TCG_CT_CONST_C12)
diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target.c.inc
+++ b/tcg/loongarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const int tcg_target_call_oarg_regs[] = {
 
 #define TCG_CT_CONST_ZERO  0x100
 #define TCG_CT_CONST_S12   0x200
-#define TCG_CT_CONST_N12   0x400
+#define TCG_CT_CONST_S32   0x400
 #define TCG_CT_CONST_U12   0x800
 #define TCG_CT_CONST_C12   0x1000
 #define TCG_CT_CONST_WSZ   0x2000
@@ -XXX,XX +XXX,XX @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
     if ((ct & TCG_CT_CONST_S12) && val == sextreg(val, 0, 12)) {
         return true;
     }
-    if ((ct & TCG_CT_CONST_N12) && -val == sextreg(-val, 0, 12)) {
+    if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
         return true;
     }
     if ((ct & TCG_CT_CONST_U12) && val >= 0 && val <= 0xfff) {
@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
     }
 }
 
+static void tcg_out_addi(TCGContext *s, TCGType type, TCGReg rd,
+                         TCGReg rs, tcg_target_long imm)
+{
+    tcg_target_long lo12 = sextreg(imm, 0, 12);
+    tcg_target_long hi16 = sextreg(imm - lo12, 16, 16);
+
+    /*
+     * Note that there's a hole in between hi16 and lo12:
+     *
+     *       3                   2                   1                   0
+     *     1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
+     * ...+-------------------------------+-------+-----------------------+
+     *    |             hi16              |       |          lo12         |
+     * ...+-------------------------------+-------+-----------------------+
+     *
+     * For bits within that hole, it's more efficient to use LU12I and ADD.
+     */
+    if (imm == (hi16 << 16) + lo12) {
+        if (hi16) {
+            tcg_out_opc_addu16i_d(s, rd, rs, hi16);
+            rs = rd;
+        }
+        if (type == TCG_TYPE_I32) {
+            tcg_out_opc_addi_w(s, rd, rs, lo12);
+        } else if (lo12) {
+            tcg_out_opc_addi_d(s, rd, rs, lo12);
+        } else {
+            tcg_out_mov(s, type, rd, rs);
+        }
+    } else {
+        tcg_out_movi(s, type, TCG_REG_TMP0, imm);
+        if (type == TCG_TYPE_I32) {
+            tcg_out_opc_add_w(s, rd, rs, TCG_REG_TMP0);
+        } else {
+            tcg_out_opc_add_d(s, rd, rs, TCG_REG_TMP0);
+        }
+    }
+}
+
 static void tcg_out_ext8u(TCGContext *s, TCGReg ret, TCGReg arg)
 {
     tcg_out_opc_andi(s, ret, arg, 0xff);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
 
     case INDEX_op_add_i32:
         if (c2) {
-            tcg_out_opc_addi_w(s, a0, a1, a2);
+            tcg_out_addi(s, TCG_TYPE_I32, a0, a1, a2);
         } else {
             tcg_out_opc_add_w(s, a0, a1, a2);
         }
         break;
     case INDEX_op_add_i64:
         if (c2) {
-            tcg_out_opc_addi_d(s, a0, a1, a2);
+            tcg_out_addi(s, TCG_TYPE_I64, a0, a1, a2);
         } else {
             tcg_out_opc_add_d(s, a0, a1, a2);
         }
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
 
     case INDEX_op_sub_i32:
         if (c2) {
-            tcg_out_opc_addi_w(s, a0, a1, -a2);
+            tcg_out_addi(s, TCG_TYPE_I32, a0, a1, -a2);
         } else {
             tcg_out_opc_sub_w(s, a0, a1, a2);
         }
         break;
     case INDEX_op_sub_i64:
         if (c2) {
-            tcg_out_opc_addi_d(s, a0, a1, -a2);
+            tcg_out_addi(s, TCG_TYPE_I64, a0, a1, -a2);
         } else {
             tcg_out_opc_sub_d(s, a0, a1, a2);
         }
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
         return C_O1_I2(r, r, ri);
 
     case INDEX_op_add_i32:
+        return C_O1_I2(r, r, ri);
     case INDEX_op_add_i64:
-        return C_O1_I2(r, r, rI);
+        return C_O1_I2(r, r, rJ);
 
     case INDEX_op_and_i32:
     case INDEX_op_and_i64:
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
         return C_O1_I2(r, 0, rZ);
 
     case INDEX_op_sub_i32:
+        return C_O1_I2(r, rZ, ri);
     case INDEX_op_sub_i64:
-        return C_O1_I2(r, rZ, rN);
+        return C_O1_I2(r, rZ, rJ);
 
     case INDEX_op_mul_i32:
     case INDEX_op_mul_i64:
--
2.34.1

Split out a helper function, tcg_out_setcond_int, which
does not always produce the complete boolean result, but
returns a set of flags to do so.

Accept all int32_t as constant input, so that LE/GT can
adjust the constant to LT.

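The flag packing is compact enough to show in a standalone sketch
(constants mirror the patch; the register number and printed
mnemonics are illustrative):

    #include <stdio.h>

    #define NB_REGS        32
    #define SETCOND_INV    NB_REGS              /* invert the boolean */
    #define SETCOND_NEZ    (SETCOND_INV << 1)   /* only zero/non-zero */
    #define SETCOND_FLAGS  (SETCOND_INV | SETCOND_NEZ)

    int main(void)
    {
        /* e.g. EQ: compute a zero/non-zero value in r5, then invert */
        int tmpflags = 5 | SETCOND_NEZ | SETCOND_INV;
        int reg = tmpflags & ~SETCOND_FLAGS;

        switch (tmpflags & SETCOND_FLAGS) {
        case 0:
            printf("r%d already holds the boolean\n", reg);
            break;
        case SETCOND_INV:
            printf("xori  ret, r%d, 1\n", reg);
            break;
        case SETCOND_NEZ:
            printf("sltu  ret, zero, r%d\n", reg);
            break;
        case SETCOND_NEZ | SETCOND_INV:
            printf("sltui ret, r%d, 1\n", reg);
            break;
        }
        return 0;
    }
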
Reviewed-by: WANG Xuerui <git@xen0n.name>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/loongarch64/tcg-target.c.inc | 165 +++++++++++++++++++++----------
 1 file changed, 115 insertions(+), 50 deletions(-)

diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target.c.inc
+++ b/tcg/loongarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_clzctz(TCGContext *s, LoongArchInsn opc,
     tcg_out_opc_or(s, a0, TCG_REG_TMP0, a0);
 }
 
-static void tcg_out_setcond(TCGContext *s, TCGCond cond, TCGReg ret,
-                            TCGReg arg1, TCGReg arg2, bool c2)
-{
-    TCGReg tmp;
+#define SETCOND_INV    TCG_TARGET_NB_REGS
+#define SETCOND_NEZ    (SETCOND_INV << 1)
+#define SETCOND_FLAGS  (SETCOND_INV | SETCOND_NEZ)
 
-    if (c2) {
-        tcg_debug_assert(arg2 == 0);
+static int tcg_out_setcond_int(TCGContext *s, TCGCond cond, TCGReg ret,
+                               TCGReg arg1, tcg_target_long arg2, bool c2)
+{
+    int flags = 0;
+
+    switch (cond) {
+    case TCG_COND_EQ:    /* -> NE  */
+    case TCG_COND_GE:    /* -> LT  */
+    case TCG_COND_GEU:   /* -> LTU */
+    case TCG_COND_GT:    /* -> LE  */
+    case TCG_COND_GTU:   /* -> LEU */
+        cond = tcg_invert_cond(cond);
+        flags ^= SETCOND_INV;
+        break;
+    default:
+        break;
     }
 
     switch (cond) {
-    case TCG_COND_EQ:
-        if (c2) {
-            tmp = arg1;
-        } else {
-            tcg_out_opc_sub_d(s, ret, arg1, arg2);
-            tmp = ret;
-        }
-        tcg_out_opc_sltui(s, ret, tmp, 1);
-        break;
-    case TCG_COND_NE:
-        if (c2) {
-            tmp = arg1;
-        } else {
-            tcg_out_opc_sub_d(s, ret, arg1, arg2);
-            tmp = ret;
-        }
-        tcg_out_opc_sltu(s, ret, TCG_REG_ZERO, tmp);
-        break;
-    case TCG_COND_LT:
-        tcg_out_opc_slt(s, ret, arg1, arg2);
-        break;
-    case TCG_COND_GE:
-        tcg_out_opc_slt(s, ret, arg1, arg2);
-        tcg_out_opc_xori(s, ret, ret, 1);
-        break;
     case TCG_COND_LE:
-        tcg_out_setcond(s, TCG_COND_GE, ret, arg2, arg1, false);
-        break;
-    case TCG_COND_GT:
-        tcg_out_setcond(s, TCG_COND_LT, ret, arg2, arg1, false);
-        break;
-    case TCG_COND_LTU:
-        tcg_out_opc_sltu(s, ret, arg1, arg2);
-        break;
-    case TCG_COND_GEU:
-        tcg_out_opc_sltu(s, ret, arg1, arg2);
-        tcg_out_opc_xori(s, ret, ret, 1);
-        break;
     case TCG_COND_LEU:
-        tcg_out_setcond(s, TCG_COND_GEU, ret, arg2, arg1, false);
+        /*
+         * If we have a constant input, the most efficient way to implement
+         * LE is by adding 1 and using LT.  Watch out for wrap around for LEU.
+         * We don't need to care for this for LE because the constant input
+         * is still constrained to int32_t, and INT32_MAX+1 is representable
+         * in the 64-bit temporary register.
+         */
+        if (c2) {
+            if (cond == TCG_COND_LEU) {
+                /* unsigned <= -1 is true */
+                if (arg2 == -1) {
+                    tcg_out_movi(s, TCG_TYPE_REG, ret, !(flags & SETCOND_INV));
+                    return ret;
+                }
+                cond = TCG_COND_LTU;
+            } else {
+                cond = TCG_COND_LT;
+            }
+            arg2 += 1;
+        } else {
+            TCGReg tmp = arg2;
+            arg2 = arg1;
+            arg1 = tmp;
+            cond = tcg_swap_cond(cond);    /* LE -> GE */
+            cond = tcg_invert_cond(cond);  /* GE -> LT */
+            flags ^= SETCOND_INV;
+        }
         break;
-    case TCG_COND_GTU:
-        tcg_out_setcond(s, TCG_COND_LTU, ret, arg2, arg1, false);
+    default:
         break;
+    }
+
+    switch (cond) {
+    case TCG_COND_NE:
+        flags |= SETCOND_NEZ;
+        if (!c2) {
+            tcg_out_opc_xor(s, ret, arg1, arg2);
+        } else if (arg2 == 0) {
+            ret = arg1;
+        } else if (arg2 >= 0 && arg2 <= 0xfff) {
+            tcg_out_opc_xori(s, ret, arg1, arg2);
+        } else {
+            tcg_out_addi(s, TCG_TYPE_REG, ret, arg1, -arg2);
+        }
+        break;
+
+    case TCG_COND_LT:
+    case TCG_COND_LTU:
+        if (c2) {
+            if (arg2 >= -0x800 && arg2 <= 0x7ff) {
+                if (cond == TCG_COND_LT) {
+                    tcg_out_opc_slti(s, ret, arg1, arg2);
+                } else {
+                    tcg_out_opc_sltui(s, ret, arg1, arg2);
+                }
+                break;
+            }
+            tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_TMP0, arg2);
+            arg2 = TCG_REG_TMP0;
+        }
+        if (cond == TCG_COND_LT) {
+            tcg_out_opc_slt(s, ret, arg1, arg2);
+        } else {
+            tcg_out_opc_sltu(s, ret, arg1, arg2);
+        }
+        break;
+
     default:
         g_assert_not_reached();
         break;
     }
+
+    return ret | flags;
+}
+
+static void tcg_out_setcond(TCGContext *s, TCGCond cond, TCGReg ret,
+                            TCGReg arg1, tcg_target_long arg2, bool c2)
+{
+    int tmpflags = tcg_out_setcond_int(s, cond, ret, arg1, arg2, c2);
+
+    if (tmpflags != ret) {
+        TCGReg tmp = tmpflags & ~SETCOND_FLAGS;
+
+        switch (tmpflags & SETCOND_FLAGS) {
+        case SETCOND_INV:
+            /* Intermediate result is boolean: simply invert. */
+            tcg_out_opc_xori(s, ret, tmp, 1);
+            break;
+        case SETCOND_NEZ:
+            /* Intermediate result is zero/non-zero: test != 0. */
+            tcg_out_opc_sltu(s, ret, TCG_REG_ZERO, tmp);
+            break;
+        case SETCOND_NEZ | SETCOND_INV:
+            /* Intermediate result is zero/non-zero: test == 0. */
+            tcg_out_opc_sltui(s, ret, tmp, 1);
+            break;
+        default:
+            g_assert_not_reached();
+        }
+    }
 }
 
 /*
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
     case INDEX_op_ctz_i64:
         return C_O1_I2(r, r, rW);
 
-    case INDEX_op_setcond_i32:
-    case INDEX_op_setcond_i64:
-        return C_O1_I2(r, r, rZ);
-
     case INDEX_op_deposit_i32:
     case INDEX_op_deposit_i64:
         /* Must deposit into the same register as input */
         return C_O1_I2(r, 0, rZ);
 
     case INDEX_op_sub_i32:
+    case INDEX_op_setcond_i32:
         return C_O1_I2(r, rZ, ri);
     case INDEX_op_sub_i64:
+    case INDEX_op_setcond_i64:
         return C_O1_I2(r, rZ, rJ);
 
     case INDEX_op_mul_i32:
     case INDEX_op_mul_i64:
--
2.34.1

Reviewed-by: WANG Xuerui <git@xen0n.name>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/loongarch64/tcg-target-con-set.h |  1 +
 tcg/loongarch64/tcg-target.h         |  4 ++--
 tcg/loongarch64/tcg-target.c.inc     | 33 ++++++++++++++++++++++++++++
 3 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/tcg/loongarch64/tcg-target-con-set.h b/tcg/loongarch64/tcg-target-con-set.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target-con-set.h
+++ b/tcg/loongarch64/tcg-target-con-set.h
@@ -XXX,XX +XXX,XX @@ C_O1_I2(r, 0, rZ)
 C_O1_I2(r, rZ, ri)
 C_O1_I2(r, rZ, rJ)
 C_O1_I2(r, rZ, rZ)
+C_O1_I4(r, rZ, rJ, rZ, rZ)
diff --git a/tcg/loongarch64/tcg-target.h b/tcg/loongarch64/tcg-target.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target.h
+++ b/tcg/loongarch64/tcg-target.h
@@ -XXX,XX +XXX,XX @@ typedef enum {
 #define TCG_TARGET_CALL_ARG_I64      TCG_CALL_ARG_NORMAL
 
 /* optional instructions */
-#define TCG_TARGET_HAS_movcond_i32      0
+#define TCG_TARGET_HAS_movcond_i32      1
 #define TCG_TARGET_HAS_div_i32          1
 #define TCG_TARGET_HAS_rem_i32          1
 #define TCG_TARGET_HAS_div2_i32         0
@@ -XXX,XX +XXX,XX @@ typedef enum {
 #define TCG_TARGET_HAS_qemu_st8_i32     0
 
 /* 64-bit operations */
-#define TCG_TARGET_HAS_movcond_i64      0
+#define TCG_TARGET_HAS_movcond_i64      1
 #define TCG_TARGET_HAS_div_i64          1
 #define TCG_TARGET_HAS_rem_i64          1
 #define TCG_TARGET_HAS_div2_i64         0
diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target.c.inc
+++ b/tcg/loongarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_setcond(TCGContext *s, TCGCond cond, TCGReg ret,
     }
 }
 
+static void tcg_out_movcond(TCGContext *s, TCGCond cond, TCGReg ret,
+                            TCGReg c1, tcg_target_long c2, bool const2,
+                            TCGReg v1, TCGReg v2)
+{
+    int tmpflags = tcg_out_setcond_int(s, cond, TCG_REG_TMP0, c1, c2, const2);
+    TCGReg t;
+
+    /* Standardize the test below to t != 0. */
+    if (tmpflags & SETCOND_INV) {
+        t = v1, v1 = v2, v2 = t;
+    }
+
+    t = tmpflags & ~SETCOND_FLAGS;
+    if (v1 == TCG_REG_ZERO) {
+        tcg_out_opc_masknez(s, ret, v2, t);
+    } else if (v2 == TCG_REG_ZERO) {
+        tcg_out_opc_maskeqz(s, ret, v1, t);
+    } else {
+        tcg_out_opc_masknez(s, TCG_REG_TMP2, v2, t); /* t ? 0 : v2 */
+        tcg_out_opc_maskeqz(s, TCG_REG_TMP1, v1, t); /* t ? v1 : 0 */
+        tcg_out_opc_or(s, ret, TCG_REG_TMP1, TCG_REG_TMP2);
+    }
+}
+
 /*
  * Branch helpers
  */
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
         tcg_out_setcond(s, args[3], a0, a1, a2, c2);
         break;
 
+    case INDEX_op_movcond_i32:
+    case INDEX_op_movcond_i64:
+        tcg_out_movcond(s, args[5], a0, a1, a2, c2, args[3], args[4]);
+        break;
+
     case INDEX_op_ld8s_i32:
     case INDEX_op_ld8s_i64:
         tcg_out_ldst(s, OPC_LD_B, a0, a1, a2);
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
     case INDEX_op_remu_i64:
         return C_O1_I2(r, rZ, rZ);
 
+    case INDEX_op_movcond_i32:
+    case INDEX_op_movcond_i64:
+        return C_O1_I4(r, rZ, rJ, rZ, rZ);
+
     default:
         g_assert_not_reached();
     }
--
2.34.1

Take the w^x split into account when computing the
pc-relative distance to an absolute pointer.

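A standalone sketch of why this matters, with made-up buffer
addresses: under a split mapping the same byte of code is visible at
a writable and an executable address, and only a displacement
computed against the executable alias is correct at run time:

    #include <stdint.h>
    #include <stddef.h>
    #include <stdio.h>

    int main(void)
    {
        uintptr_t buf_rw = 0x200000000000;    /* writable alias */
        uintptr_t buf_rx = 0x300000000000;    /* executable alias */
        uintptr_t code_ptr = buf_rw + 0x1234; /* where we emit */
        uintptr_t target = buf_rx + 0x8000;   /* absolute pointer */

        /* wrong: distance measured from the rw alias */
        ptrdiff_t bad = (ptrdiff_t)(target - code_ptr);
        /* right: translate code_ptr to its rx alias first */
        ptrdiff_t good = (ptrdiff_t)(target - (buf_rx + (code_ptr - buf_rw)));

        printf("bad=%#tx good=%#tx\n", bad, good);
        return 0;
    }
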
Reviewed-by: WANG Xuerui <git@xen0n.name>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/loongarch64/tcg-target.c.inc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target.c.inc
+++ b/tcg/loongarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_ldst(TCGContext *s, LoongArchInsn opc, TCGReg data,
     intptr_t imm12 = sextreg(offset, 0, 12);
 
     if (offset != imm12) {
-        intptr_t diff = offset - (uintptr_t)s->code_ptr;
+        intptr_t diff = tcg_pcrel_diff(s, (void *)offset);
 
         if (addr == TCG_REG_ZERO && diff == (int32_t)diff) {
             imm12 = sextreg(diff, 0, 12);
--
2.34.1

From: Claudio Fontana <cfontana@suse.de>

Signed-off-by: Claudio Fontana <cfontana@suse.de>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-Id: <20201015143217.29337-4-cfontana@suse.de>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/tcg-cpus-icount.h |  6 +--
 accel/tcg/tcg-cpus-rr.h     |  2 +-
 accel/tcg/tcg-cpus.h        |  6 +--
 accel/tcg/tcg-cpus-icount.c | 24 ++++++------
 accel/tcg/tcg-cpus-mttcg.c  | 10 ++---
 accel/tcg/tcg-cpus-rr.c     | 74 ++++++++++++++++++-------------------
 accel/tcg/tcg-cpus.c        |  6 +--
 7 files changed, 64 insertions(+), 64 deletions(-)

diff --git a/accel/tcg/tcg-cpus-icount.h b/accel/tcg/tcg-cpus-icount.h
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-cpus-icount.h
+++ b/accel/tcg/tcg-cpus-icount.h
@@ -XXX,XX +XXX,XX @@
 #ifndef TCG_CPUS_ICOUNT_H
 #define TCG_CPUS_ICOUNT_H
 
-void handle_icount_deadline(void);
-void prepare_icount_for_run(CPUState *cpu);
-void process_icount_data(CPUState *cpu);
+void icount_handle_deadline(void);
+void icount_prepare_for_run(CPUState *cpu);
+void icount_process_data(CPUState *cpu);
 
 #endif /* TCG_CPUS_ICOUNT_H */
diff --git a/accel/tcg/tcg-cpus-rr.h b/accel/tcg/tcg-cpus-rr.h
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-cpus-rr.h
+++ b/accel/tcg/tcg-cpus-rr.h
@@ -XXX,XX +XXX,XX @@
 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
 
 /* Kick all RR vCPUs. */
-void qemu_cpu_kick_rr_cpus(CPUState *unused);
+void rr_kick_vcpu_thread(CPUState *unused);
 
 /* start the round robin vcpu thread */
 void rr_start_vcpu_thread(CPUState *cpu);
diff --git a/accel/tcg/tcg-cpus.h b/accel/tcg/tcg-cpus.h
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-cpus.h
+++ b/accel/tcg/tcg-cpus.h
@@ -XXX,XX +XXX,XX @@ extern const CpusAccel tcg_cpus_mttcg;
 extern const CpusAccel tcg_cpus_icount;
 extern const CpusAccel tcg_cpus_rr;
 
-void qemu_tcg_destroy_vcpu(CPUState *cpu);
-int tcg_cpu_exec(CPUState *cpu);
-void tcg_handle_interrupt(CPUState *cpu, int mask);
+void tcg_cpus_destroy(CPUState *cpu);
+int tcg_cpus_exec(CPUState *cpu);
+void tcg_cpus_handle_interrupt(CPUState *cpu, int mask);
 
 #endif /* TCG_CPUS_H */
diff --git a/accel/tcg/tcg-cpus-icount.c b/accel/tcg/tcg-cpus-icount.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-cpus-icount.c
+++ b/accel/tcg/tcg-cpus-icount.c
@@ -XXX,XX +XXX,XX @@
 #include "tcg-cpus-icount.h"
 #include "tcg-cpus-rr.h"
 
-static int64_t tcg_get_icount_limit(void)
+static int64_t icount_get_limit(void)
 {
     int64_t deadline;
 
@@ -XXX,XX +XXX,XX @@ static int64_t tcg_get_icount_limit(void)
     }
 }
 
-static void notify_aio_contexts(void)
+static void icount_notify_aio_contexts(void)
 {
     /* Wake up other AioContexts. */
     qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
     qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
 }
 
-void handle_icount_deadline(void)
+void icount_handle_deadline(void)
 {
     assert(qemu_in_vcpu_thread());
     int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
                                                   QEMU_TIMER_ATTR_ALL);
 
     if (deadline == 0) {
-        notify_aio_contexts();
+        icount_notify_aio_contexts();
     }
 }
 
-void prepare_icount_for_run(CPUState *cpu)
+void icount_prepare_for_run(CPUState *cpu)
 {
     int insns_left;
 
     /*
-     * These should always be cleared by process_icount_data after
+     * These should always be cleared by icount_process_data after
      * each vCPU execution. However u16.high can be raised
-     * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
+     * asynchronously by cpu_exit/cpu_interrupt/tcg_cpus_handle_interrupt
      */
     g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
     g_assert(cpu->icount_extra == 0);
 
-    cpu->icount_budget = tcg_get_icount_limit();
+    cpu->icount_budget = icount_get_limit();
     insns_left = MIN(0xffff, cpu->icount_budget);
     cpu_neg(cpu)->icount_decr.u16.low = insns_left;
     cpu->icount_extra = cpu->icount_budget - insns_left;
@@ -XXX,XX +XXX,XX @@ void prepare_icount_for_run(CPUState *cpu)
     replay_mutex_lock();
 
     if (cpu->icount_budget == 0 && replay_has_checkpoint()) {
-        notify_aio_contexts();
+        icount_notify_aio_contexts();
     }
 }
 
-void process_icount_data(CPUState *cpu)
+void icount_process_data(CPUState *cpu)
 {
     /* Account for executed instructions */
     icount_update(cpu);
@@ -XXX,XX +XXX,XX @@ static void icount_handle_interrupt(CPUState *cpu, int mask)
 {
     int old_mask = cpu->interrupt_request;
 
-    tcg_handle_interrupt(cpu, mask);
+    tcg_cpus_handle_interrupt(cpu, mask);
     if (qemu_cpu_is_self(cpu) &&
         !cpu->can_do_io
         && (mask & ~old_mask) != 0) {
@@ -XXX,XX +XXX,XX @@ static void icount_handle_interrupt(CPUState *cpu, int mask)
 
 const CpusAccel tcg_cpus_icount = {
     .create_vcpu_thread = rr_start_vcpu_thread,
-    .kick_vcpu_thread = qemu_cpu_kick_rr_cpus,
+    .kick_vcpu_thread = rr_kick_vcpu_thread,
 
     .handle_interrupt = icount_handle_interrupt,
     .get_virtual_clock = icount_get,
diff --git a/accel/tcg/tcg-cpus-mttcg.c b/accel/tcg/tcg-cpus-mttcg.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-cpus-mttcg.c
+++ b/accel/tcg/tcg-cpus-mttcg.c
@@ -XXX,XX +XXX,XX @@
  * current CPUState for a given thread.
  */
 
-static void *tcg_cpu_thread_fn(void *arg)
+static void *mttcg_cpu_thread_fn(void *arg)
 {
     CPUState *cpu = arg;
 
@@ -XXX,XX +XXX,XX @@ static void *tcg_cpu_thread_fn(void *arg)
         if (cpu_can_run(cpu)) {
             int r;
             qemu_mutex_unlock_iothread();
-            r = tcg_cpu_exec(cpu);
+            r = tcg_cpus_exec(cpu);
             qemu_mutex_lock_iothread();
             switch (r) {
             case EXCP_DEBUG:
@@ -XXX,XX +XXX,XX @@ static void *tcg_cpu_thread_fn(void *arg)
         qemu_wait_io_event(cpu);
     } while (!cpu->unplug || cpu_can_run(cpu));
 
-    qemu_tcg_destroy_vcpu(cpu);
+    tcg_cpus_destroy(cpu);
     qemu_mutex_unlock_iothread();
     rcu_unregister_thread();
     return NULL;
@@ -XXX,XX +XXX,XX @@ static void mttcg_start_vcpu_thread(CPUState *cpu)
     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
              cpu->cpu_index);
 
-    qemu_thread_create(cpu->thread, thread_name, tcg_cpu_thread_fn,
+    qemu_thread_create(cpu->thread, thread_name, mttcg_cpu_thread_fn,
                        cpu, QEMU_THREAD_JOINABLE);
 
 #ifdef _WIN32
@@ -XXX,XX +XXX,XX @@ const CpusAccel tcg_cpus_mttcg = {
     .create_vcpu_thread = mttcg_start_vcpu_thread,
     .kick_vcpu_thread = mttcg_kick_vcpu_thread,
 
-    .handle_interrupt = tcg_handle_interrupt,
+    .handle_interrupt = tcg_cpus_handle_interrupt,
 };
diff --git a/accel/tcg/tcg-cpus-rr.c b/accel/tcg/tcg-cpus-rr.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-cpus-rr.c
+++ b/accel/tcg/tcg-cpus-rr.c
@@ -XXX,XX +XXX,XX @@
 #include "tcg-cpus-icount.h"
 
 /* Kick all RR vCPUs */
-void qemu_cpu_kick_rr_cpus(CPUState *unused)
+void rr_kick_vcpu_thread(CPUState *unused)
 {
     CPUState *cpu;
 
@@ -XXX,XX +XXX,XX @@ void qemu_cpu_kick_rr_cpus(CPUState *unused)
  * idleness is complete.
  */
 
-static QEMUTimer *tcg_kick_vcpu_timer;
-static CPUState *tcg_current_rr_cpu;
+static QEMUTimer *rr_kick_vcpu_timer;
+static CPUState *rr_current_cpu;
 
 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
 
-static inline int64_t qemu_tcg_next_kick(void)
+static inline int64_t rr_next_kick_time(void)
 {
     return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
 }
 
 /* Kick the currently round-robin scheduled vCPU to next */
-static void qemu_cpu_kick_rr_next_cpu(void)
+static void rr_kick_next_cpu(void)
 {
     CPUState *cpu;
     do {
-        cpu = qatomic_mb_read(&tcg_current_rr_cpu);
+        cpu = qatomic_mb_read(&rr_current_cpu);
         if (cpu) {
             cpu_exit(cpu);
         }
-    } while (cpu != qatomic_mb_read(&tcg_current_rr_cpu));
+    } while (cpu != qatomic_mb_read(&rr_current_cpu));
 }
 
-static void kick_tcg_thread(void *opaque)
+static void rr_kick_thread(void *opaque)
 {
-    timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
-    qemu_cpu_kick_rr_next_cpu();
+    timer_mod(rr_kick_vcpu_timer, rr_next_kick_time());
+    rr_kick_next_cpu();
 }
 
-static void start_tcg_kick_timer(void)
+static void rr_start_kick_timer(void)
 {
-    if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
-        tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
-                                           kick_tcg_thread, NULL);
+    if (!rr_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
+        rr_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
+                                          rr_kick_thread, NULL);
     }
-    if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
-        timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
+    if (rr_kick_vcpu_timer && !timer_pending(rr_kick_vcpu_timer)) {
+        timer_mod(rr_kick_vcpu_timer, rr_next_kick_time());
     }
 }
 
-static void stop_tcg_kick_timer(void)
+static void rr_stop_kick_timer(void)
 {
-    if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
-        timer_del(tcg_kick_vcpu_timer);
+    if (rr_kick_vcpu_timer && timer_pending(rr_kick_vcpu_timer)) {
+        timer_del(rr_kick_vcpu_timer);
     }
 }
 
-static void qemu_tcg_rr_wait_io_event(void)
+static void rr_wait_io_event(void)
 {
     CPUState *cpu;
 
     while (all_cpu_threads_idle()) {
-        stop_tcg_kick_timer();
+        rr_stop_kick_timer();
         qemu_cond_wait_iothread(first_cpu->halt_cond);
     }
 
-    start_tcg_kick_timer();
+    rr_start_kick_timer();
 
     CPU_FOREACH(cpu) {
         qemu_wait_io_event_common(cpu);
@@ -XXX,XX +XXX,XX @@ static void qemu_tcg_rr_wait_io_event(void)
  * Destroy any remaining vCPUs which have been unplugged and have
  * finished running
  */
-static void deal_with_unplugged_cpus(void)
+static void rr_deal_with_unplugged_cpus(void)
 {
     CPUState *cpu;
 
     CPU_FOREACH(cpu) {
         if (cpu->unplug && !cpu_can_run(cpu)) {
-            qemu_tcg_destroy_vcpu(cpu);
+            tcg_cpus_destroy(cpu);
             break;
         }
     }
@@ -XXX,XX +XXX,XX @@ static void deal_with_unplugged_cpus(void)
  * elsewhere.
  */
 
-static void *tcg_rr_cpu_thread_fn(void *arg)
+static void *rr_cpu_thread_fn(void *arg)
 {
     CPUState *cpu = arg;
 
@@ -XXX,XX +XXX,XX @@ static void *tcg_rr_cpu_thread_fn(void *arg)
         }
     }
 
-    start_tcg_kick_timer();
+    rr_start_kick_timer();
 
     cpu = first_cpu;
 
@@ -XXX,XX +XXX,XX @@ static void *tcg_rr_cpu_thread_fn(void *arg)
              * Run the timers here. This is much more efficient than
              * waking up the I/O thread and waiting for completion.
              */
-            handle_icount_deadline();
+            icount_handle_deadline();
         }
 
         replay_mutex_unlock();
@@ -XXX,XX +XXX,XX @@ static void *tcg_rr_cpu_thread_fn(void *arg)
 
         while (cpu && cpu_work_list_empty(cpu) && !cpu->exit_request) {
 
-            qatomic_mb_set(&tcg_current_rr_cpu, cpu);
+            qatomic_mb_set(&rr_current_cpu, cpu);
             current_cpu = cpu;
 
             qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
@@ -XXX,XX +XXX,XX @@ static void *tcg_rr_cpu_thread_fn(void *arg)
 
                 qemu_mutex_unlock_iothread();
                 if (icount_enabled()) {
-                    prepare_icount_for_run(cpu);
+                    icount_prepare_for_run(cpu);
                 }
-                r = tcg_cpu_exec(cpu);
+                r = tcg_cpus_exec(cpu);
                 if (icount_enabled()) {
-                    process_icount_data(cpu);
+                    icount_process_data(cpu);
                 }
                 qemu_mutex_lock_iothread();
 
@@ -XXX,XX +XXX,XX @@ static void *tcg_rr_cpu_thread_fn(void *arg)
         } /* while (cpu && !cpu->exit_request).. */
 
         /* Does not need qatomic_mb_set because a spurious wakeup is okay. */
-        qatomic_set(&tcg_current_rr_cpu, NULL);
+        qatomic_set(&rr_current_cpu, NULL);
 
         if (cpu && cpu->exit_request) {
             qatomic_mb_set(&cpu->exit_request, 0);
@@ -XXX,XX +XXX,XX @@ static void *tcg_rr_cpu_thread_fn(void *arg)
             qemu_notify_event();
         }
 
-        qemu_tcg_rr_wait_io_event();
-        deal_with_unplugged_cpus();
+        rr_wait_io_event();
+        rr_deal_with_unplugged_cpus();
     }
 
     rcu_unregister_thread();
@@ -XXX,XX +XXX,XX @@ void rr_start_vcpu_thread(CPUState *cpu)
         /* share a single thread for all cpus with TCG */
         snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
         qemu_thread_create(cpu->thread, thread_name,
-                           tcg_rr_cpu_thread_fn,
+                           rr_cpu_thread_fn,
                            cpu, QEMU_THREAD_JOINABLE);
 
         single_tcg_halt_cond = cpu->halt_cond;
@@ -XXX,XX +XXX,XX @@ void rr_start_vcpu_thread(CPUState *cpu)
 
 const CpusAccel tcg_cpus_rr = {
     .create_vcpu_thread = rr_start_vcpu_thread,
-    .kick_vcpu_thread = qemu_cpu_kick_rr_cpus,
+    .kick_vcpu_thread = rr_kick_vcpu_thread,
 
-    .handle_interrupt = tcg_handle_interrupt,
+    .handle_interrupt = tcg_cpus_handle_interrupt,
 };
diff --git a/accel/tcg/tcg-cpus.c b/accel/tcg/tcg-cpus.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-cpus.c
+++ b/accel/tcg/tcg-cpus.c
@@ -XXX,XX +XXX,XX @@
 
 /* common functionality among all TCG variants */
 
-void qemu_tcg_destroy_vcpu(CPUState *cpu)
+void tcg_cpus_destroy(CPUState *cpu)
 {
     cpu_thread_signal_destroyed(cpu);
 }
 
-int tcg_cpu_exec(CPUState *cpu)
+int tcg_cpus_exec(CPUState *cpu)
 {
     int ret;
 #ifdef CONFIG_PROFILER
@@ -XXX,XX +XXX,XX @@ int tcg_cpu_exec(CPUState *cpu)
 }
 
 /* mask must never be zero, except for A20 change call */
-void tcg_handle_interrupt(CPUState *cpu, int mask)
+void tcg_cpus_handle_interrupt(CPUState *cpu, int mask)
 {
     g_assert(qemu_mutex_iothread_locked());
 
--
2.25.1

The old implementation replaces two insns, swapping between

    b         <dest>
    nop
and
    pcaddu18i tmp, <dest>
    jirl      zero, tmp, <dest> & 0xffff

There is a race condition in which a thread could be stopped at
the jirl, i.e. with the top of the address loaded, and when
restarted we have re-linked to a different TB, so that the top
half no longer matches the bottom half.

Note that while we never directly re-link to a different TB, we
can link, unlink, and link again all while the stopped thread
remains stopped.

The new implementation replaces only one insn, swapping between

    b         <dest>
and
    pcadd     tmp, <jmp_addr>

falling through to load the address from tmp, and branch.

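A standalone sketch of the direct-branch range test used when
re-patching (addresses are made up; sextreg mimics QEMU's helper):

    #include <stdint.h>
    #include <stdio.h>

    static int64_t sextreg(uint64_t val, int pos, int len)
    {
        return (int64_t)(val << (64 - pos - len)) >> (64 - len);
    }

    int main(void)
    {
        uint64_t jmp_rx = 0x7f0000001000;  /* insn being patched */
        uint64_t d_addr = 0x7f0000805000;  /* new branch target */
        int64_t d_disp = (int64_t)(d_addr - jmp_rx) >> 2;

        if (d_disp == sextreg(d_disp, 0, 26)) {
            /* one atomic 4-byte store of a direct "b" suffices */
            printf("direct b, offset %ld insns\n", (long)d_disp);
        } else {
            /* patch a single pcadd and take the indirect slot load */
            printf("indirect via jmp_target_addr slot\n");
        }
        return 0;
    }
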
Reviewed-by: WANG Xuerui <git@xen0n.name>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/loongarch64/tcg-target.h     |  7 +---
 tcg/loongarch64/tcg-target.c.inc | 72 ++++++++++++++------------------
 2 files changed, 33 insertions(+), 46 deletions(-)

diff --git a/tcg/loongarch64/tcg-target.h b/tcg/loongarch64/tcg-target.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target.h
+++ b/tcg/loongarch64/tcg-target.h
@@ -XXX,XX +XXX,XX @@
 
 #define TCG_TARGET_INSN_UNIT_SIZE 4
 #define TCG_TARGET_NB_REGS 32
-/*
- * PCADDU18I + JIRL sequence can give 20 + 16 + 2 = 38 bits
- * signed offset, which is +/- 128 GiB.
- */
-#define MAX_CODE_GEN_BUFFER_SIZE  (128 * GiB)
+
+#define MAX_CODE_GEN_BUFFER_SIZE  ((size_t)-1)
 
 typedef enum {
     TCG_REG_ZERO,
diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target.c.inc
+++ b/tcg/loongarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args)
 #endif
 }
 
-/* LoongArch uses `andi zero, zero, 0` as NOP. */
-#define NOP OPC_ANDI
-static void tcg_out_nop(TCGContext *s)
-{
-    tcg_out32(s, NOP);
-}
-
-void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
-                              uintptr_t jmp_rx, uintptr_t jmp_rw)
-{
-    tcg_insn_unit i1, i2;
-    ptrdiff_t upper, lower;
-    uintptr_t addr = tb->jmp_target_addr[n];
-    ptrdiff_t offset = (ptrdiff_t)(addr - jmp_rx) >> 2;
-
-    if (offset == sextreg(offset, 0, 26)) {
-        i1 = encode_sd10k16_insn(OPC_B, offset);
-        i2 = NOP;
-    } else {
-        tcg_debug_assert(offset == sextreg(offset, 0, 36));
-        lower = (int16_t)offset;
-        upper = (offset - lower) >> 16;
-
-        i1 = encode_dsj20_insn(OPC_PCADDU18I, TCG_REG_TMP0, upper);
-        i2 = encode_djsk16_insn(OPC_JIRL, TCG_REG_ZERO, TCG_REG_TMP0, lower);
-    }
-    uint64_t pair = ((uint64_t)i2 << 32) | i1;
-    qatomic_set((uint64_t *)jmp_rw, pair);
-    flush_idcache_range(jmp_rx, jmp_rw, 8);
-}
-
 /*
  * Entry-points
  */
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
 static void tcg_out_goto_tb(TCGContext *s, int which)
 {
     /*
-     * Ensure that patch area is 8-byte aligned so that an
-     * atomic write can be used to patch the target address.
+     * Direct branch, or load indirect address, to be patched
+     * by tb_target_set_jmp_target.  Check indirect load offset
+     * in range early, regardless of direct branch distance,
+     * via assert within tcg_out_opc_pcaddu2i.
      */
-    if ((uintptr_t)s->code_ptr & 7) {
-        tcg_out_nop(s);
-    }
+    uintptr_t i_addr = get_jmp_target_addr(s, which);
+    intptr_t i_disp = tcg_pcrel_diff(s, (void *)i_addr);
+
     set_jmp_insn_offset(s, which);
-    /*
-     * actual branch destination will be patched by
-     * tb_target_set_jmp_target later
-     */
-    tcg_out_opc_pcaddu18i(s, TCG_REG_TMP0, 0);
+    tcg_out_opc_pcaddu2i(s, TCG_REG_TMP0, i_disp >> 2);
+
+    /* Finish the load and indirect branch. */
+    tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, TCG_REG_TMP0, 0);
     tcg_out_opc_jirl(s, TCG_REG_ZERO, TCG_REG_TMP0, 0);
     set_jmp_reset_offset(s, which);
 }
 
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
+                              uintptr_t jmp_rx, uintptr_t jmp_rw)
+{
+    uintptr_t d_addr = tb->jmp_target_addr[n];
+    ptrdiff_t d_disp = (ptrdiff_t)(d_addr - jmp_rx) >> 2;
+    tcg_insn_unit insn;
+
+    /* Either directly branch, or load slot address for indirect branch. */
+    if (d_disp == sextreg(d_disp, 0, 26)) {
+        insn = encode_sd10k16_insn(OPC_B, d_disp);
+    } else {
+        uintptr_t i_addr = (uintptr_t)&tb->jmp_target_addr[n];
+        intptr_t i_disp = i_addr - jmp_rx;
+        insn = encode_dsj20_insn(OPC_PCADDU2I, TCG_REG_TMP0, i_disp >> 2);
+    }
+
+    qatomic_set((tcg_insn_unit *)jmp_rw, insn);
+    flush_idcache_range(jmp_rx, jmp_rw, 4);
+}
+
 static void tcg_out_op(TCGContext *s, TCGOpcode opc,
                        const TCGArg args[TCG_MAX_OP_ARGS],
                        const int const_args[TCG_MAX_OP_ARGS])
--
2.34.1