Second pull for this week, since this set is large enough by itself.

r~


The following changes since commit 7c9236d6d61f30583d5d860097d88dbf0fe487bf:

  Merge tag 'pull-tcg-20230116' of https://gitlab.com/rth7680/qemu into staging (2023-01-17 10:24:16 +0000)

are available in the Git repository at:

  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20230117

for you to fetch changes up to 493c9b19a7fb7f387c4fcf57d3836504d5242bf5:

  tcg/riscv: Implement direct branch for goto_tb (2023-01-17 22:36:17 +0000)

----------------------------------------------------------------
tcg: Fix race conditions in (most) goto_tb implementations

----------------------------------------------------------------
Richard Henderson (22):
      tcg: Split out tcg_out_exit_tb
      tcg/i386: Remove unused goto_tb code for indirect jump
      tcg/ppc: Remove unused goto_tb code for indirect jump
      tcg/sparc64: Remove unused goto_tb code for indirect jump
      tcg: Replace asserts on tcg_jmp_insn_offset
      tcg: Introduce set_jmp_insn_offset
      tcg: Introduce get_jmp_target_addr
      tcg: Split out tcg_out_goto_tb
      tcg: Rename TB_JMP_RESET_OFFSET_INVALID to TB_JMP_OFFSET_INVALID
      tcg: Add gen_tb to TCGContext
      tcg: Add TranslationBlock.jmp_insn_offset
      tcg: Change tb_target_set_jmp_target arguments
      tcg: Move tb_target_set_jmp_target declaration to tcg.h
      tcg: Always define tb_target_set_jmp_target
      tcg: Remove TCG_TARGET_HAS_direct_jump
      tcg/aarch64: Reorg goto_tb implementation
      tcg/ppc: Reorg goto_tb implementation
      tcg/sparc64: Remove USE_REG_TB
      tcg/sparc64: Reorg goto_tb implementation
      tcg/arm: Implement direct branch for goto_tb
      tcg/riscv: Introduce OPC_NOP
      tcg/riscv: Implement direct branch for goto_tb

 include/exec/exec-all.h | 5 +-
 include/tcg/tcg.h | 14 ++-
 tcg/aarch64/tcg-target.h | 6 +-
 tcg/arm/tcg-target.h | 5 -
 tcg/i386/tcg-target.h | 9 --
 tcg/loongarch64/tcg-target.h | 3 -
 tcg/mips/tcg-target.h | 5 -
 tcg/ppc/tcg-target.h | 7 +-
 tcg/riscv/tcg-target.h | 4 -
 tcg/s390x/tcg-target.h | 11 ---
 tcg/sparc64/tcg-target.h | 4 -
 tcg/tci/tcg-target.h | 4 -
 accel/tcg/cpu-exec.c | 21 ++--
 accel/tcg/translate-all.c | 10 +-
 tcg/tcg-op.c | 14 +--
 tcg/tcg.c | 42 +++++---
 tcg/aarch64/tcg-target.c.inc | 106 ++++++++++-----------
 tcg/arm/tcg-target.c.inc | 89 +++++++++++------
 tcg/i386/tcg-target.c.inc | 68 +++++++------
 tcg/loongarch64/tcg-target.c.inc | 66 +++++++------
 tcg/mips/tcg-target.c.inc | 59 +++++++-----
 tcg/ppc/tcg-target.c.inc | 193 ++++++++++++-------------------------
 tcg/riscv/tcg-target.c.inc | 65 +++++++++----
 tcg/s390x/tcg-target.c.inc | 67 ++++++++-----
 tcg/sparc64/tcg-target.c.inc | 201 +++++++++++++++------------------------
 tcg/tci/tcg-target.c.inc | 31 +++---
 26 files changed, 528 insertions(+), 581 deletions(-)
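
For readers skimming the series: the race in question is between one vCPU thread executing a translated block and another thread re-linking or un-linking that block's outgoing jump. As a rough, self-contained illustration of the invariant the reworked backends aim for (this is not QEMU code; FakeTB and both helpers are invented names), the patched jump target must always be published with a single word-sized atomic store:

/*
 * Illustrative sketch only -- not QEMU code and not part of this series.
 * It models the property the goto_tb rework relies on: the jump target
 * of a translated block may be re-linked by one thread while another
 * thread follows it, so the target must be updated with one atomic
 * store, never as two partial writes.
 */
#include <inttypes.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
    _Atomic uintptr_t jmp_target;   /* stands in for the patched branch */
    uintptr_t reset_target;         /* "unlinked" fallback address */
} FakeTB;

static void tb_set_jmp_target(FakeTB *tb, uintptr_t addr)
{
    /* One atomic store: a concurrent reader sees old or new, never torn. */
    atomic_store_explicit(&tb->jmp_target, addr, memory_order_release);
}

static uintptr_t tb_next(const FakeTB *tb)
{
    return atomic_load_explicit(&tb->jmp_target, memory_order_acquire);
}

int main(void)
{
    FakeTB tb = { .reset_target = 0x1000 };
    atomic_init(&tb.jmp_target, tb.reset_target);

    tb_set_jmp_target(&tb, 0x2000);           /* link to another block */
    printf("jump resolves to 0x%" PRIxPTR "\n", tb_next(&tb));

    tb_set_jmp_target(&tb, tb.reset_target);  /* unlink again */
    printf("jump resolves to 0x%" PRIxPTR "\n", tb_next(&tb));
    return 0;
}

In the real backends the same property is obtained by keeping the patched branch or target word suitably aligned so it can be rewritten with a single store; several patches below add or check exactly that alignment.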
New patch
The INDEX_op_exit_tb opcode needs no register allocation.
Split out a dedicated helper function for it.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg.c | 4 ++++
 tcg/aarch64/tcg-target.c.inc | 22 ++++++++++--------
 tcg/arm/tcg-target.c.inc | 11 +++++----
 tcg/i386/tcg-target.c.inc | 21 +++++++++--------
 tcg/loongarch64/tcg-target.c.inc | 22 ++++++++++--------
 tcg/mips/tcg-target.c.inc | 33 +++++++++++++--------------
 tcg/ppc/tcg-target.c.inc | 11 +++++----
 tcg/riscv/tcg-target.c.inc | 22 ++++++++++--------
 tcg/s390x/tcg-target.c.inc | 23 ++++++++++---------
 tcg/sparc64/tcg-target.c.inc | 39 +++++++++++++++++---------------
 tcg/tci/tcg-target.c.inc | 10 ++++----
 11 files changed, 121 insertions(+), 97 deletions(-)
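
The point of the split is that exit_tb's operand is an immediate known at code-generation time, so the opcode can be emitted straight from the opcode loop instead of going through the register allocator. A toy sketch of that dispatch shape (invented names; not the actual tcg_gen_code code):

/*
 * Toy sketch, not QEMU code.  Opcodes whose operands are plain
 * constants (here OP_EXIT with an immediate return value) are emitted
 * by a dedicated helper, bypassing the register allocation that
 * variable-operand ops need.
 */
#include <stdint.h>
#include <stdio.h>

typedef enum { OP_ADD, OP_EXIT } Opcode;

typedef struct {
    Opcode opc;
    uintptr_t args[2];
} Op;

static void emit_exit(uintptr_t retval)
{
    printf("exit, return %#lx to the main loop\n", (unsigned long)retval);
}

static void emit_with_regalloc(const Op *op)
{
    printf("allocate registers, then emit opcode %d\n", op->opc);
}

static void gen_code(const Op *ops, int n)
{
    for (int i = 0; i < n; i++) {
        switch (ops[i].opc) {
        case OP_EXIT:                 /* constant operand: no regalloc */
            emit_exit(ops[i].args[0]);
            break;
        default:
            emit_with_regalloc(&ops[i]);
            break;
        }
    }
}

int main(void)
{
    Op ops[] = { { OP_ADD, { 0, 0 } }, { OP_EXIT, { 0 } } };
    gen_code(ops, 2);
    return 0;
}
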
diff --git a/tcg/tcg.c b/tcg/tcg.c
22
index XXXXXXX..XXXXXXX 100644
23
--- a/tcg/tcg.c
24
+++ b/tcg/tcg.c
25
@@ -XXX,XX +XXX,XX @@ static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg1,
26
static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg);
27
static void tcg_out_movi(TCGContext *s, TCGType type,
28
TCGReg ret, tcg_target_long arg);
29
+static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg);
30
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
31
const TCGArg args[TCG_MAX_OP_ARGS],
32
const int const_args[TCG_MAX_OP_ARGS]);
33
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, target_ulong pc_start)
34
case INDEX_op_call:
35
tcg_reg_alloc_call(s, op);
36
break;
37
+ case INDEX_op_exit_tb:
38
+ tcg_out_exit_tb(s, op->args[0]);
39
+ break;
40
case INDEX_op_dup2_vec:
41
if (tcg_reg_alloc_dup2(s, op)) {
42
break;
43
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
44
index XXXXXXX..XXXXXXX 100644
45
--- a/tcg/aarch64/tcg-target.c.inc
46
+++ b/tcg/aarch64/tcg-target.c.inc
47
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
48
49
static const tcg_insn_unit *tb_ret_addr;
50
51
+static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
52
+{
53
+ /* Reuse the zeroing that exists for goto_ptr. */
54
+ if (a0 == 0) {
55
+ tcg_out_goto_long(s, tcg_code_gen_epilogue);
56
+ } else {
57
+ tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_X0, a0);
58
+ tcg_out_goto_long(s, tb_ret_addr);
59
+ }
60
+}
61
+
62
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
63
const TCGArg args[TCG_MAX_OP_ARGS],
64
const int const_args[TCG_MAX_OP_ARGS])
65
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
66
#define REG0(I) (const_args[I] ? TCG_REG_XZR : (TCGReg)args[I])
67
68
switch (opc) {
69
- case INDEX_op_exit_tb:
70
- /* Reuse the zeroing that exists for goto_ptr. */
71
- if (a0 == 0) {
72
- tcg_out_goto_long(s, tcg_code_gen_epilogue);
73
- } else {
74
- tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_X0, a0);
75
- tcg_out_goto_long(s, tb_ret_addr);
76
- }
77
- break;
78
-
79
case INDEX_op_goto_tb:
80
tcg_debug_assert(s->tb_jmp_insn_offset != NULL);
81
/*
82
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
83
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
84
case INDEX_op_mov_i64:
85
case INDEX_op_call: /* Always emitted via tcg_out_call. */
86
+ case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
87
default:
88
g_assert_not_reached();
89
}
90
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
91
index XXXXXXX..XXXXXXX 100644
92
--- a/tcg/arm/tcg-target.c.inc
93
+++ b/tcg/arm/tcg-target.c.inc
94
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
95
96
static void tcg_out_epilogue(TCGContext *s);
97
98
+static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg)
99
+{
100
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R0, arg);
101
+ tcg_out_epilogue(s);
102
+}
103
+
104
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
105
const TCGArg args[TCG_MAX_OP_ARGS],
106
const int const_args[TCG_MAX_OP_ARGS])
107
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
108
int c;
109
110
switch (opc) {
111
- case INDEX_op_exit_tb:
112
- tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R0, args[0]);
113
- tcg_out_epilogue(s);
114
- break;
115
case INDEX_op_goto_tb:
116
{
117
/* Indirect jump method */
118
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
119
120
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
121
case INDEX_op_call: /* Always emitted via tcg_out_call. */
122
+ case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
123
default:
124
tcg_abort();
125
}
126
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
127
index XXXXXXX..XXXXXXX 100644
128
--- a/tcg/i386/tcg-target.c.inc
129
+++ b/tcg/i386/tcg-target.c.inc
130
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
131
#endif
132
}
133
134
+static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
135
+{
136
+ /* Reuse the zeroing that exists for goto_ptr. */
137
+ if (a0 == 0) {
138
+ tcg_out_jmp(s, tcg_code_gen_epilogue);
139
+ } else {
140
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
141
+ tcg_out_jmp(s, tb_ret_addr);
142
+ }
143
+}
144
+
145
static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
146
const TCGArg args[TCG_MAX_OP_ARGS],
147
const int const_args[TCG_MAX_OP_ARGS])
148
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
149
const_a2 = const_args[2];
150
151
switch (opc) {
152
- case INDEX_op_exit_tb:
153
- /* Reuse the zeroing that exists for goto_ptr. */
154
- if (a0 == 0) {
155
- tcg_out_jmp(s, tcg_code_gen_epilogue);
156
- } else {
157
- tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
158
- tcg_out_jmp(s, tb_ret_addr);
159
- }
160
- break;
161
case INDEX_op_goto_tb:
162
if (s->tb_jmp_insn_offset) {
163
/* direct jump method */
164
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
165
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
166
case INDEX_op_mov_i64:
167
case INDEX_op_call: /* Always emitted via tcg_out_call. */
168
+ case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
169
default:
170
tcg_abort();
171
}
172
diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
173
index XXXXXXX..XXXXXXX 100644
174
--- a/tcg/loongarch64/tcg-target.c.inc
175
+++ b/tcg/loongarch64/tcg-target.c.inc
176
@@ -XXX,XX +XXX,XX @@ void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
177
178
static const tcg_insn_unit *tb_ret_addr;
179
180
+static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
181
+{
182
+ /* Reuse the zeroing that exists for goto_ptr. */
183
+ if (a0 == 0) {
184
+ tcg_out_call_int(s, tcg_code_gen_epilogue, true);
185
+ } else {
186
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_A0, a0);
187
+ tcg_out_call_int(s, tb_ret_addr, true);
188
+ }
189
+}
190
+
191
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
192
const TCGArg args[TCG_MAX_OP_ARGS],
193
const int const_args[TCG_MAX_OP_ARGS])
194
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
195
int c2 = const_args[2];
196
197
switch (opc) {
198
- case INDEX_op_exit_tb:
199
- /* Reuse the zeroing that exists for goto_ptr. */
200
- if (a0 == 0) {
201
- tcg_out_call_int(s, tcg_code_gen_epilogue, true);
202
- } else {
203
- tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_A0, a0);
204
- tcg_out_call_int(s, tb_ret_addr, true);
205
- }
206
- break;
207
-
208
case INDEX_op_goto_tb:
209
tcg_debug_assert(s->tb_jmp_insn_offset != NULL);
210
/*
211
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
212
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
213
case INDEX_op_mov_i64:
214
case INDEX_op_call: /* Always emitted via tcg_out_call. */
215
+ case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
216
default:
217
g_assert_not_reached();
218
}
219
diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
220
index XXXXXXX..XXXXXXX 100644
221
--- a/tcg/mips/tcg-target.c.inc
222
+++ b/tcg/mips/tcg-target.c.inc
223
@@ -XXX,XX +XXX,XX @@ static void tcg_out_clz(TCGContext *s, MIPSInsn opcv2, MIPSInsn opcv6,
224
}
225
}
226
227
+static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
228
+{
229
+ TCGReg b0 = TCG_REG_ZERO;
230
+
231
+ if (a0 & ~0xffff) {
232
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_V0, a0 & ~0xffff);
233
+ b0 = TCG_REG_V0;
234
+ }
235
+ if (!tcg_out_opc_jmp(s, OPC_J, tb_ret_addr)) {
236
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_TMP0, (uintptr_t)tb_ret_addr);
237
+ tcg_out_opc_reg(s, OPC_JR, 0, TCG_TMP0, 0);
238
+ }
239
+ tcg_out_opc_imm(s, OPC_ORI, TCG_REG_V0, b0, a0 & 0xffff);
240
+}
241
+
242
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
243
const TCGArg args[TCG_MAX_OP_ARGS],
244
const int const_args[TCG_MAX_OP_ARGS])
245
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
246
c2 = const_args[2];
247
248
switch (opc) {
249
- case INDEX_op_exit_tb:
250
- {
251
- TCGReg b0 = TCG_REG_ZERO;
252
-
253
- a0 = (intptr_t)a0;
254
- if (a0 & ~0xffff) {
255
- tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_V0, a0 & ~0xffff);
256
- b0 = TCG_REG_V0;
257
- }
258
- if (!tcg_out_opc_jmp(s, OPC_J, tb_ret_addr)) {
259
- tcg_out_movi(s, TCG_TYPE_PTR, TCG_TMP0,
260
- (uintptr_t)tb_ret_addr);
261
- tcg_out_opc_reg(s, OPC_JR, 0, TCG_TMP0, 0);
262
- }
263
- tcg_out_opc_imm(s, OPC_ORI, TCG_REG_V0, b0, a0 & 0xffff);
264
- }
265
- break;
266
case INDEX_op_goto_tb:
267
/* indirect jump method */
268
tcg_debug_assert(s->tb_jmp_insn_offset == 0);
269
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
270
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
271
case INDEX_op_mov_i64:
272
case INDEX_op_call: /* Always emitted via tcg_out_call. */
273
+ case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
274
default:
275
tcg_abort();
276
}
277
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
278
index XXXXXXX..XXXXXXX 100644
279
--- a/tcg/ppc/tcg-target.c.inc
280
+++ b/tcg/ppc/tcg-target.c.inc
281
@@ -XXX,XX +XXX,XX @@ static void tcg_target_qemu_prologue(TCGContext *s)
282
tcg_out32(s, BCLR | BO_ALWAYS);
283
}
284
285
+static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg)
286
+{
287
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R3, arg);
288
+ tcg_out_b(s, 0, tcg_code_gen_epilogue);
289
+}
290
+
291
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
292
const TCGArg args[TCG_MAX_OP_ARGS],
293
const int const_args[TCG_MAX_OP_ARGS])
294
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
295
TCGArg a0, a1, a2;
296
297
switch (opc) {
298
- case INDEX_op_exit_tb:
299
- tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R3, args[0]);
300
- tcg_out_b(s, 0, tcg_code_gen_epilogue);
301
- break;
302
case INDEX_op_goto_tb:
303
if (s->tb_jmp_insn_offset) {
304
/* Direct jump. */
305
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
306
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
307
case INDEX_op_mov_i64:
308
case INDEX_op_call: /* Always emitted via tcg_out_call. */
309
+ case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
310
default:
311
tcg_abort();
312
}
313
diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
314
index XXXXXXX..XXXXXXX 100644
315
--- a/tcg/riscv/tcg-target.c.inc
316
+++ b/tcg/riscv/tcg-target.c.inc
317
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
318
319
static const tcg_insn_unit *tb_ret_addr;
320
321
+static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
322
+{
323
+ /* Reuse the zeroing that exists for goto_ptr. */
324
+ if (a0 == 0) {
325
+ tcg_out_call_int(s, tcg_code_gen_epilogue, true);
326
+ } else {
327
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_A0, a0);
328
+ tcg_out_call_int(s, tb_ret_addr, true);
329
+ }
330
+}
331
+
332
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
333
const TCGArg args[TCG_MAX_OP_ARGS],
334
const int const_args[TCG_MAX_OP_ARGS])
335
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
336
int c2 = const_args[2];
337
338
switch (opc) {
339
- case INDEX_op_exit_tb:
340
- /* Reuse the zeroing that exists for goto_ptr. */
341
- if (a0 == 0) {
342
- tcg_out_call_int(s, tcg_code_gen_epilogue, true);
343
- } else {
344
- tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_A0, a0);
345
- tcg_out_call_int(s, tb_ret_addr, true);
346
- }
347
- break;
348
-
349
case INDEX_op_goto_tb:
350
assert(s->tb_jmp_insn_offset == 0);
351
/* indirect jump method */
352
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
353
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
354
case INDEX_op_mov_i64:
355
case INDEX_op_call: /* Always emitted via tcg_out_call. */
356
+ case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
357
default:
358
g_assert_not_reached();
359
}
360
diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
361
index XXXXXXX..XXXXXXX 100644
362
--- a/tcg/s390x/tcg-target.c.inc
363
+++ b/tcg/s390x/tcg-target.c.inc
364
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext* s, TCGReg data_reg, TCGReg addr_reg,
365
#endif
366
}
367
368
+static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
369
+{
370
+ /* Reuse the zeroing that exists for goto_ptr. */
371
+ if (a0 == 0) {
372
+ tgen_gotoi(s, S390_CC_ALWAYS, tcg_code_gen_epilogue);
373
+ } else {
374
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R2, a0);
375
+ tgen_gotoi(s, S390_CC_ALWAYS, tb_ret_addr);
376
+ }
377
+}
378
+
379
# define OP_32_64(x) \
380
case glue(glue(INDEX_op_,x),_i32): \
381
case glue(glue(INDEX_op_,x),_i64)
382
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
383
TCGArg a0, a1, a2;
384
385
switch (opc) {
386
- case INDEX_op_exit_tb:
387
- /* Reuse the zeroing that exists for goto_ptr. */
388
- a0 = args[0];
389
- if (a0 == 0) {
390
- tgen_gotoi(s, S390_CC_ALWAYS, tcg_code_gen_epilogue);
391
- } else {
392
- tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R2, a0);
393
- tgen_gotoi(s, S390_CC_ALWAYS, tb_ret_addr);
394
- }
395
- break;
396
-
397
case INDEX_op_goto_tb:
398
a0 = args[0];
399
/*
400
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
401
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
402
case INDEX_op_mov_i64:
403
case INDEX_op_call: /* Always emitted via tcg_out_call. */
404
+ case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
405
default:
406
tcg_abort();
407
}
408
diff --git a/tcg/sparc64/tcg-target.c.inc b/tcg/sparc64/tcg-target.c.inc
409
index XXXXXXX..XXXXXXX 100644
410
--- a/tcg/sparc64/tcg-target.c.inc
411
+++ b/tcg/sparc64/tcg-target.c.inc
412
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg data, TCGReg addr,
413
#endif /* CONFIG_SOFTMMU */
414
}
415
416
+static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
417
+{
418
+ if (check_fit_ptr(a0, 13)) {
419
+ tcg_out_arithi(s, TCG_REG_G0, TCG_REG_I7, 8, RETURN);
420
+ tcg_out_movi_imm13(s, TCG_REG_O0, a0);
421
+ return;
422
+ } else if (USE_REG_TB) {
423
+ intptr_t tb_diff = tcg_tbrel_diff(s, (void *)a0);
424
+ if (check_fit_ptr(tb_diff, 13)) {
425
+ tcg_out_arithi(s, TCG_REG_G0, TCG_REG_I7, 8, RETURN);
426
+ /* Note that TCG_REG_TB has been unwound to O1. */
427
+ tcg_out_arithi(s, TCG_REG_O0, TCG_REG_O1, tb_diff, ARITH_ADD);
428
+ return;
429
+ }
430
+ }
431
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_I0, a0 & ~0x3ff);
432
+ tcg_out_arithi(s, TCG_REG_G0, TCG_REG_I7, 8, RETURN);
433
+ tcg_out_arithi(s, TCG_REG_O0, TCG_REG_O0, a0 & 0x3ff, ARITH_OR);
434
+}
435
+
436
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
437
const TCGArg args[TCG_MAX_OP_ARGS],
438
const int const_args[TCG_MAX_OP_ARGS])
439
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
440
c2 = const_args[2];
441
442
switch (opc) {
443
- case INDEX_op_exit_tb:
444
- if (check_fit_ptr(a0, 13)) {
445
- tcg_out_arithi(s, TCG_REG_G0, TCG_REG_I7, 8, RETURN);
446
- tcg_out_movi_imm13(s, TCG_REG_O0, a0);
447
- break;
448
- } else if (USE_REG_TB) {
449
- intptr_t tb_diff = tcg_tbrel_diff(s, (void *)a0);
450
- if (check_fit_ptr(tb_diff, 13)) {
451
- tcg_out_arithi(s, TCG_REG_G0, TCG_REG_I7, 8, RETURN);
452
- /* Note that TCG_REG_TB has been unwound to O1. */
453
- tcg_out_arithi(s, TCG_REG_O0, TCG_REG_O1, tb_diff, ARITH_ADD);
454
- break;
455
- }
456
- }
457
- tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_I0, a0 & ~0x3ff);
458
- tcg_out_arithi(s, TCG_REG_G0, TCG_REG_I7, 8, RETURN);
459
- tcg_out_arithi(s, TCG_REG_O0, TCG_REG_O0, a0 & 0x3ff, ARITH_OR);
460
- break;
461
case INDEX_op_goto_tb:
462
if (s->tb_jmp_insn_offset) {
463
/* direct jump method */
464
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
465
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
466
case INDEX_op_mov_i64:
467
case INDEX_op_call: /* Always emitted via tcg_out_call. */
468
+ case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
469
default:
470
tcg_abort();
471
}
472
diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
473
index XXXXXXX..XXXXXXX 100644
474
--- a/tcg/tci/tcg-target.c.inc
475
+++ b/tcg/tci/tcg-target.c.inc
476
@@ -XXX,XX +XXX,XX @@ static void tcg_out_call(TCGContext *s, const tcg_insn_unit *func,
477
# define CASE_64(x)
478
#endif
479
480
+static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg)
481
+{
482
+ tcg_out_op_p(s, INDEX_op_exit_tb, (void *)arg);
483
+}
484
+
485
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
486
const TCGArg args[TCG_MAX_OP_ARGS],
487
const int const_args[TCG_MAX_OP_ARGS])
488
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
489
TCGOpcode exts;
490
491
switch (opc) {
492
- case INDEX_op_exit_tb:
493
- tcg_out_op_p(s, opc, (void *)args[0]);
494
- break;
495
-
496
case INDEX_op_goto_tb:
497
tcg_debug_assert(s->tb_jmp_insn_offset == 0);
498
/* indirect jump method. */
499
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
500
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
501
case INDEX_op_mov_i64:
502
case INDEX_op_call: /* Always emitted via tcg_out_call. */
503
+ case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
504
default:
505
tcg_abort();
506
}
507
--
2.34.1

New patch
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/i386/tcg-target.c.inc | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
8
index XXXXXXX..XXXXXXX 100644
9
--- a/tcg/i386/tcg-target.c.inc
10
+++ b/tcg/i386/tcg-target.c.inc
11
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
12
13
switch (opc) {
14
case INDEX_op_goto_tb:
15
- if (s->tb_jmp_insn_offset) {
16
- /* direct jump method */
17
- int gap;
18
- /* jump displacement must be aligned for atomic patching;
19
+ qemu_build_assert(TCG_TARGET_HAS_direct_jump);
20
+ {
21
+ /*
22
+ * Jump displacement must be aligned for atomic patching;
23
* see if we need to add extra nops before jump
24
*/
25
- gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
26
+ int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
27
if (gap != 1) {
28
tcg_out_nopn(s, gap - 1);
29
}
30
tcg_out8(s, OPC_JMP_long); /* jmp im */
31
s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
32
tcg_out32(s, 0);
33
- } else {
34
- /* indirect jump method */
35
- tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
36
- (intptr_t)(s->tb_jmp_target_addr + a0));
37
}
38
set_jmp_reset_offset(s, a0);
39
break;
40
--
2.34.1

New patch
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/ppc/tcg-target.c.inc | 32 +++++++++++++-------------------
 1 file changed, 13 insertions(+), 19 deletions(-)
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
8
index XXXXXXX..XXXXXXX 100644
9
--- a/tcg/ppc/tcg-target.c.inc
10
+++ b/tcg/ppc/tcg-target.c.inc
11
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
12
13
switch (opc) {
14
case INDEX_op_goto_tb:
15
- if (s->tb_jmp_insn_offset) {
16
- /* Direct jump. */
17
- if (TCG_TARGET_REG_BITS == 64) {
18
- /* Ensure the next insns are 8 or 16-byte aligned. */
19
- while ((uintptr_t)s->code_ptr & (have_isa_2_07 ? 15 : 7)) {
20
- tcg_out32(s, NOP);
21
- }
22
- s->tb_jmp_insn_offset[args[0]] = tcg_current_code_size(s);
23
- tcg_out32(s, ADDIS | TAI(TCG_REG_TB, TCG_REG_TB, 0));
24
- tcg_out32(s, ADDI | TAI(TCG_REG_TB, TCG_REG_TB, 0));
25
- } else {
26
- s->tb_jmp_insn_offset[args[0]] = tcg_current_code_size(s);
27
- tcg_out32(s, B);
28
- s->tb_jmp_reset_offset[args[0]] = tcg_current_code_size(s);
29
- break;
30
+ qemu_build_assert(TCG_TARGET_HAS_direct_jump);
31
+ /* Direct jump. */
32
+ if (TCG_TARGET_REG_BITS == 64) {
33
+ /* Ensure the next insns are 8 or 16-byte aligned. */
34
+ while ((uintptr_t)s->code_ptr & (have_isa_2_07 ? 15 : 7)) {
35
+ tcg_out32(s, NOP);
36
}
37
+ s->tb_jmp_insn_offset[args[0]] = tcg_current_code_size(s);
38
+ tcg_out32(s, ADDIS | TAI(TCG_REG_TB, TCG_REG_TB, 0));
39
+ tcg_out32(s, ADDI | TAI(TCG_REG_TB, TCG_REG_TB, 0));
40
} else {
41
- /* Indirect jump. */
42
- tcg_debug_assert(s->tb_jmp_insn_offset == NULL);
43
- tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TB, 0,
44
- (intptr_t)(s->tb_jmp_insn_offset + args[0]));
45
+ s->tb_jmp_insn_offset[args[0]] = tcg_current_code_size(s);
46
+ tcg_out32(s, B);
47
+ s->tb_jmp_reset_offset[args[0]] = tcg_current_code_size(s);
48
+ break;
49
}
50
tcg_out32(s, MTSPR | RS(TCG_REG_TB) | CTR);
51
tcg_out32(s, BCCTR | BO_ALWAYS);
52
--
2.34.1

New patch
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/sparc64/tcg-target.c.inc | 41 +++++++++++-------------------
 1 file changed, 12 insertions(+), 29 deletions(-)
diff --git a/tcg/sparc64/tcg-target.c.inc b/tcg/sparc64/tcg-target.c.inc
8
index XXXXXXX..XXXXXXX 100644
9
--- a/tcg/sparc64/tcg-target.c.inc
10
+++ b/tcg/sparc64/tcg-target.c.inc
11
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
12
return false;
13
}
14
15
-static void tcg_out_ld_ptr(TCGContext *s, TCGReg ret, const void *arg)
16
-{
17
- intptr_t diff = tcg_tbrel_diff(s, arg);
18
- if (USE_REG_TB && check_fit_ptr(diff, 13)) {
19
- tcg_out_ld(s, TCG_TYPE_PTR, ret, TCG_REG_TB, diff);
20
- return;
21
- }
22
- tcg_out_movi(s, TCG_TYPE_PTR, ret, (uintptr_t)arg & ~0x3ff);
23
- tcg_out_ld(s, TCG_TYPE_PTR, ret, ret, (uintptr_t)arg & 0x3ff);
24
-}
25
-
26
static void tcg_out_sety(TCGContext *s, TCGReg rs)
27
{
28
tcg_out32(s, WRY | INSN_RS1(TCG_REG_G0) | INSN_RS2(rs));
29
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
30
31
switch (opc) {
32
case INDEX_op_goto_tb:
33
- if (s->tb_jmp_insn_offset) {
34
- /* direct jump method */
35
- if (USE_REG_TB) {
36
- /* make sure the patch is 8-byte aligned. */
37
- if ((intptr_t)s->code_ptr & 4) {
38
- tcg_out_nop(s);
39
- }
40
- s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
41
- tcg_out_sethi(s, TCG_REG_T1, 0);
42
- tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, 0, ARITH_OR);
43
- tcg_out_arith(s, TCG_REG_G0, TCG_REG_TB, TCG_REG_T1, JMPL);
44
- tcg_out_arith(s, TCG_REG_TB, TCG_REG_TB, TCG_REG_T1, ARITH_ADD);
45
- } else {
46
- s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
47
- tcg_out32(s, CALL);
48
+ qemu_build_assert(TCG_TARGET_HAS_direct_jump);
49
+ /* Direct jump. */
50
+ if (USE_REG_TB) {
51
+ /* make sure the patch is 8-byte aligned. */
52
+ if ((intptr_t)s->code_ptr & 4) {
53
tcg_out_nop(s);
54
}
55
+ s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
56
+ tcg_out_sethi(s, TCG_REG_T1, 0);
57
+ tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, 0, ARITH_OR);
58
+ tcg_out_arith(s, TCG_REG_G0, TCG_REG_TB, TCG_REG_T1, JMPL);
59
+ tcg_out_arith(s, TCG_REG_TB, TCG_REG_TB, TCG_REG_T1, ARITH_ADD);
60
} else {
61
- /* indirect jump method */
62
- tcg_out_ld_ptr(s, TCG_REG_TB, s->tb_jmp_target_addr + a0);
63
- tcg_out_arithi(s, TCG_REG_G0, TCG_REG_TB, 0, JMPL);
64
+ s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
65
+ tcg_out32(s, CALL);
66
tcg_out_nop(s);
67
}
68
set_jmp_reset_offset(s, a0);
69
--
2.34.1

New patch
Test TCG_TARGET_HAS_direct_jump instead of testing an
implementation pointer.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/aarch64/tcg-target.c.inc | 2 +-
 tcg/arm/tcg-target.c.inc | 2 +-
 tcg/loongarch64/tcg-target.c.inc | 2 +-
 tcg/mips/tcg-target.c.inc | 2 +-
 tcg/riscv/tcg-target.c.inc | 2 +-
 tcg/tci/tcg-target.c.inc | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
17
index XXXXXXX..XXXXXXX 100644
18
--- a/tcg/aarch64/tcg-target.c.inc
19
+++ b/tcg/aarch64/tcg-target.c.inc
20
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
21
22
switch (opc) {
23
case INDEX_op_goto_tb:
24
- tcg_debug_assert(s->tb_jmp_insn_offset != NULL);
25
+ qemu_build_assert(TCG_TARGET_HAS_direct_jump);
26
/*
27
* Ensure that ADRP+ADD are 8-byte aligned so that an atomic
28
* write can be used to patch the target address.
29
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
30
index XXXXXXX..XXXXXXX 100644
31
--- a/tcg/arm/tcg-target.c.inc
32
+++ b/tcg/arm/tcg-target.c.inc
33
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
34
intptr_t ptr, dif, dil;
35
TCGReg base = TCG_REG_PC;
36
37
- tcg_debug_assert(s->tb_jmp_insn_offset == 0);
38
+ qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
39
ptr = (intptr_t)tcg_splitwx_to_rx(s->tb_jmp_target_addr + args[0]);
40
dif = tcg_pcrel_diff(s, (void *)ptr) - 8;
41
dil = sextract32(dif, 0, 12);
42
diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
43
index XXXXXXX..XXXXXXX 100644
44
--- a/tcg/loongarch64/tcg-target.c.inc
45
+++ b/tcg/loongarch64/tcg-target.c.inc
46
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
47
48
switch (opc) {
49
case INDEX_op_goto_tb:
50
- tcg_debug_assert(s->tb_jmp_insn_offset != NULL);
51
+ qemu_build_assert(TCG_TARGET_HAS_direct_jump);
52
/*
53
* Ensure that patch area is 8-byte aligned so that an
54
* atomic write can be used to patch the target address.
55
diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
56
index XXXXXXX..XXXXXXX 100644
57
--- a/tcg/mips/tcg-target.c.inc
58
+++ b/tcg/mips/tcg-target.c.inc
59
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
60
switch (opc) {
61
case INDEX_op_goto_tb:
62
/* indirect jump method */
63
- tcg_debug_assert(s->tb_jmp_insn_offset == 0);
64
+ qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
65
tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP0, TCG_REG_ZERO,
66
(uintptr_t)(s->tb_jmp_target_addr + a0));
67
tcg_out_opc_reg(s, OPC_JR, 0, TCG_TMP0, 0);
68
diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
69
index XXXXXXX..XXXXXXX 100644
70
--- a/tcg/riscv/tcg-target.c.inc
71
+++ b/tcg/riscv/tcg-target.c.inc
72
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
73
74
switch (opc) {
75
case INDEX_op_goto_tb:
76
- assert(s->tb_jmp_insn_offset == 0);
77
+ qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
78
/* indirect jump method */
79
tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, TCG_REG_ZERO,
80
(uintptr_t)(s->tb_jmp_target_addr + a0));
81
diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
82
index XXXXXXX..XXXXXXX 100644
83
--- a/tcg/tci/tcg-target.c.inc
84
+++ b/tcg/tci/tcg-target.c.inc
85
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
86
87
switch (opc) {
88
case INDEX_op_goto_tb:
89
- tcg_debug_assert(s->tb_jmp_insn_offset == 0);
90
+ qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
91
/* indirect jump method. */
92
tcg_out_op_p(s, opc, s->tb_jmp_target_addr + args[0]);
93
set_jmp_reset_offset(s, args[0]);
94
--
2.34.1

New patch
Similar to the existing set_jmp_reset_offset. Move any assert for
TCG_TARGET_HAS_direct_jump into the new function (which now cannot
be build-time). Will be unused if TCG_TARGET_HAS_direct_jump is
constant 0, but we can't test for constant in the preprocessor,
so just mark it G_GNUC_UNUSED.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg.c | 10 ++++++++++
 tcg/aarch64/tcg-target.c.inc | 3 +--
 tcg/i386/tcg-target.c.inc | 3 +--
 tcg/loongarch64/tcg-target.c.inc | 3 +--
 tcg/ppc/tcg-target.c.inc | 7 +++----
 tcg/s390x/tcg-target.c.inc | 2 +-
 tcg/sparc64/tcg-target.c.inc | 5 ++---
 7 files changed, 19 insertions(+), 14 deletions(-)
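
A toy illustration of the G_GNUC_UNUSED reasoning above (HAS_DIRECT_JUMP is an invented stand-in macro; this is not QEMU code): when the macro folds to a constant 0 the only caller disappears, and the attribute keeps the otherwise-unused helper from warning.

/*
 * Sketch only.  HAS_DIRECT_JUMP stands in for TCG_TARGET_HAS_direct_jump:
 * on some targets it is not a preprocessor constant, so the helper cannot
 * be guarded with #if, but when it happens to be a constant 0 the call
 * below is folded away and the helper would otherwise trigger
 * -Wunused-function.
 */
#include <assert.h>
#include <stdio.h>

#define G_GNUC_UNUSED __attribute__((__unused__))   /* as in glib */
#define HAS_DIRECT_JUMP 0                           /* assumed for the sketch */

static int jmp_insn_offset;

static void G_GNUC_UNUSED set_jmp_insn_offset(int which)
{
    assert(HAS_DIRECT_JUMP);
    jmp_insn_offset = which;
}

static void emit_goto_tb(int which)
{
    if (HAS_DIRECT_JUMP) {       /* dead code when constant 0 */
        set_jmp_insn_offset(which);
    }
    printf("emitted goto_tb slot %d\n", which);
}

int main(void)
{
    emit_goto_tb(0);
    return 0;
}
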
diff --git a/tcg/tcg.c b/tcg/tcg.c
20
index XXXXXXX..XXXXXXX 100644
21
--- a/tcg/tcg.c
22
+++ b/tcg/tcg.c
23
@@ -XXX,XX +XXX,XX @@ static void set_jmp_reset_offset(TCGContext *s, int which)
24
s->tb_jmp_reset_offset[which] = tcg_current_code_size(s);
25
}
26
27
+static void G_GNUC_UNUSED set_jmp_insn_offset(TCGContext *s, int which)
28
+{
29
+ /*
30
+ * We will check for overflow at the end of the opcode loop in
31
+ * tcg_gen_code, where we bound tcg_current_code_size to UINT16_MAX.
32
+ */
33
+ tcg_debug_assert(TCG_TARGET_HAS_direct_jump);
34
+ s->tb_jmp_insn_offset[which] = tcg_current_code_size(s);
35
+}
36
+
37
/* Signal overflow, starting over with fewer guest insns. */
38
static G_NORETURN
39
void tcg_raise_tb_overflow(TCGContext *s)
40
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
41
index XXXXXXX..XXXXXXX 100644
42
--- a/tcg/aarch64/tcg-target.c.inc
43
+++ b/tcg/aarch64/tcg-target.c.inc
44
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
45
46
switch (opc) {
47
case INDEX_op_goto_tb:
48
- qemu_build_assert(TCG_TARGET_HAS_direct_jump);
49
/*
50
* Ensure that ADRP+ADD are 8-byte aligned so that an atomic
51
* write can be used to patch the target address.
52
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
53
if ((uintptr_t)s->code_ptr & 7) {
54
tcg_out32(s, NOP);
55
}
56
- s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
57
+ set_jmp_insn_offset(s, a0);
58
/*
59
* actual branch destination will be patched by
60
* tb_target_set_jmp_target later
61
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
62
index XXXXXXX..XXXXXXX 100644
63
--- a/tcg/i386/tcg-target.c.inc
64
+++ b/tcg/i386/tcg-target.c.inc
65
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
66
67
switch (opc) {
68
case INDEX_op_goto_tb:
69
- qemu_build_assert(TCG_TARGET_HAS_direct_jump);
70
{
71
/*
72
* Jump displacement must be aligned for atomic patching;
73
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
74
tcg_out_nopn(s, gap - 1);
75
}
76
tcg_out8(s, OPC_JMP_long); /* jmp im */
77
- s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
78
+ set_jmp_insn_offset(s, a0);
79
tcg_out32(s, 0);
80
}
81
set_jmp_reset_offset(s, a0);
82
diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
83
index XXXXXXX..XXXXXXX 100644
84
--- a/tcg/loongarch64/tcg-target.c.inc
85
+++ b/tcg/loongarch64/tcg-target.c.inc
86
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
87
88
switch (opc) {
89
case INDEX_op_goto_tb:
90
- qemu_build_assert(TCG_TARGET_HAS_direct_jump);
91
/*
92
* Ensure that patch area is 8-byte aligned so that an
93
* atomic write can be used to patch the target address.
94
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
95
if ((uintptr_t)s->code_ptr & 7) {
96
tcg_out_nop(s);
97
}
98
- s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
99
+ set_jmp_insn_offset(s, a0);
100
/*
101
* actual branch destination will be patched by
102
* tb_target_set_jmp_target later
103
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
104
index XXXXXXX..XXXXXXX 100644
105
--- a/tcg/ppc/tcg-target.c.inc
106
+++ b/tcg/ppc/tcg-target.c.inc
107
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
108
109
switch (opc) {
110
case INDEX_op_goto_tb:
111
- qemu_build_assert(TCG_TARGET_HAS_direct_jump);
112
/* Direct jump. */
113
if (TCG_TARGET_REG_BITS == 64) {
114
/* Ensure the next insns are 8 or 16-byte aligned. */
115
while ((uintptr_t)s->code_ptr & (have_isa_2_07 ? 15 : 7)) {
116
tcg_out32(s, NOP);
117
}
118
- s->tb_jmp_insn_offset[args[0]] = tcg_current_code_size(s);
119
+ set_jmp_insn_offset(s, args[0]);
120
tcg_out32(s, ADDIS | TAI(TCG_REG_TB, TCG_REG_TB, 0));
121
tcg_out32(s, ADDI | TAI(TCG_REG_TB, TCG_REG_TB, 0));
122
} else {
123
- s->tb_jmp_insn_offset[args[0]] = tcg_current_code_size(s);
124
+ set_jmp_insn_offset(s, args[0]);
125
tcg_out32(s, B);
126
- s->tb_jmp_reset_offset[args[0]] = tcg_current_code_size(s);
127
+ set_jmp_reset_offset(s, args[0]);
128
break;
129
}
130
tcg_out32(s, MTSPR | RS(TCG_REG_TB) | CTR);
131
diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
132
index XXXXXXX..XXXXXXX 100644
133
--- a/tcg/s390x/tcg-target.c.inc
134
+++ b/tcg/s390x/tcg-target.c.inc
135
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
136
tcg_out16(s, NOP);
137
}
138
tcg_out16(s, RIL_BRCL | (S390_CC_ALWAYS << 4));
139
- s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
140
+ set_jmp_insn_offset(s, a0);
141
s->code_ptr += 2;
142
set_jmp_reset_offset(s, a0);
143
break;
144
diff --git a/tcg/sparc64/tcg-target.c.inc b/tcg/sparc64/tcg-target.c.inc
145
index XXXXXXX..XXXXXXX 100644
146
--- a/tcg/sparc64/tcg-target.c.inc
147
+++ b/tcg/sparc64/tcg-target.c.inc
148
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
149
150
switch (opc) {
151
case INDEX_op_goto_tb:
152
- qemu_build_assert(TCG_TARGET_HAS_direct_jump);
153
/* Direct jump. */
154
if (USE_REG_TB) {
155
/* make sure the patch is 8-byte aligned. */
156
if ((intptr_t)s->code_ptr & 4) {
157
tcg_out_nop(s);
158
}
159
- s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
160
+ set_jmp_insn_offset(s, a0);
161
tcg_out_sethi(s, TCG_REG_T1, 0);
162
tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, 0, ARITH_OR);
163
tcg_out_arith(s, TCG_REG_G0, TCG_REG_TB, TCG_REG_T1, JMPL);
164
tcg_out_arith(s, TCG_REG_TB, TCG_REG_TB, TCG_REG_T1, ARITH_ADD);
165
} else {
166
- s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
167
+ set_jmp_insn_offset(s, a0);
168
tcg_out32(s, CALL);
169
tcg_out_nop(s);
170
}
171
--
2.34.1

New patch
Similar to the existing set_jmp_reset_offset. Include the
rw->rx address space conversion done by arm and s390x, and
forgotten by mips and riscv.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg.c | 9 +++++++++
 tcg/arm/tcg-target.c.inc | 2 +-
 tcg/mips/tcg-target.c.inc | 2 +-
 tcg/riscv/tcg-target.c.inc | 2 +-
 tcg/tci/tcg-target.c.inc | 2 +-
 5 files changed, 13 insertions(+), 4 deletions(-)
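
For context on the rw->rx conversion: with a split code buffer the translator writes one mapping of the buffer while generated code executes an alias of it at a fixed displacement, so any address embedded for pc-relative use must be the executable alias. A minimal sketch under assumed names and a made-up displacement (not QEMU code):

/*
 * Sketch only.  splitwx_diff, splitwx_to_rx and jmp_target_slots are
 * invented for illustration of the rw (read-write) vs rx (read-execute)
 * aliasing; the real conversion lives in QEMU's tcg_splitwx_to_rx().
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static intptr_t splitwx_diff = 0x10000;   /* assumed fixed rw->rx offset */

static uintptr_t splitwx_to_rx(const void *rw_ptr)
{
    return (uintptr_t)rw_ptr + splitwx_diff;
}

int main(void)
{
    static uintptr_t jmp_target_slots[4];  /* stand-in for the target table */

    /* The emitter stores the jump target through the rw mapping ... */
    jmp_target_slots[2] = 0x2000;

    /* ... but pc-relative loads in generated code must use the rx alias. */
    printf("rw slot at %p, rx alias at 0x%" PRIxPTR "\n",
           (void *)&jmp_target_slots[2], splitwx_to_rx(&jmp_target_slots[2]));
    return 0;
}
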
diff --git a/tcg/tcg.c b/tcg/tcg.c
17
index XXXXXXX..XXXXXXX 100644
18
--- a/tcg/tcg.c
19
+++ b/tcg/tcg.c
20
@@ -XXX,XX +XXX,XX @@ static void G_GNUC_UNUSED set_jmp_insn_offset(TCGContext *s, int which)
21
s->tb_jmp_insn_offset[which] = tcg_current_code_size(s);
22
}
23
24
+static uintptr_t G_GNUC_UNUSED get_jmp_target_addr(TCGContext *s, int which)
25
+{
26
+ /*
27
+ * Return the read-execute version of the pointer, for the benefit
28
+ * of any pc-relative addressing mode.
29
+ */
30
+ return (uintptr_t)tcg_splitwx_to_rx(&s->tb_jmp_target_addr[which]);
31
+}
32
+
33
/* Signal overflow, starting over with fewer guest insns. */
34
static G_NORETURN
35
void tcg_raise_tb_overflow(TCGContext *s)
36
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
37
index XXXXXXX..XXXXXXX 100644
38
--- a/tcg/arm/tcg-target.c.inc
39
+++ b/tcg/arm/tcg-target.c.inc
40
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
41
TCGReg base = TCG_REG_PC;
42
43
qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
44
- ptr = (intptr_t)tcg_splitwx_to_rx(s->tb_jmp_target_addr + args[0]);
45
+ ptr = get_jmp_target_addr(s, args[0]);
46
dif = tcg_pcrel_diff(s, (void *)ptr) - 8;
47
dil = sextract32(dif, 0, 12);
48
if (dif != dil) {
49
diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
50
index XXXXXXX..XXXXXXX 100644
51
--- a/tcg/mips/tcg-target.c.inc
52
+++ b/tcg/mips/tcg-target.c.inc
53
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
54
/* indirect jump method */
55
qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
56
tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP0, TCG_REG_ZERO,
57
- (uintptr_t)(s->tb_jmp_target_addr + a0));
58
+ get_jmp_target_addr(s, a0));
59
tcg_out_opc_reg(s, OPC_JR, 0, TCG_TMP0, 0);
60
tcg_out_nop(s);
61
set_jmp_reset_offset(s, a0);
62
diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
63
index XXXXXXX..XXXXXXX 100644
64
--- a/tcg/riscv/tcg-target.c.inc
65
+++ b/tcg/riscv/tcg-target.c.inc
66
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
67
qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
68
/* indirect jump method */
69
tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, TCG_REG_ZERO,
70
- (uintptr_t)(s->tb_jmp_target_addr + a0));
71
+ get_jmp_target_addr(s, a0));
72
tcg_out_opc_imm(s, OPC_JALR, TCG_REG_ZERO, TCG_REG_TMP0, 0);
73
set_jmp_reset_offset(s, a0);
74
break;
75
diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
76
index XXXXXXX..XXXXXXX 100644
77
--- a/tcg/tci/tcg-target.c.inc
78
+++ b/tcg/tci/tcg-target.c.inc
79
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
80
case INDEX_op_goto_tb:
81
qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
82
/* indirect jump method. */
83
- tcg_out_op_p(s, opc, s->tb_jmp_target_addr + args[0]);
84
+ tcg_out_op_p(s, opc, (void *)get_jmp_target_addr(s, args[0]));
85
set_jmp_reset_offset(s, args[0]);
86
break;
87
88
--
2.34.1

New patch
The INDEX_op_goto_tb opcode needs no register allocation.
Split out a dedicated helper function for it.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg.c | 4 ++
 tcg/aarch64/tcg-target.c.inc | 40 ++++++++++---------
 tcg/arm/tcg-target.c.inc | 49 ++++++++++++-----------
 tcg/i386/tcg-target.c.inc | 33 ++++++++--------
 tcg/loongarch64/tcg-target.c.inc | 38 +++++++++---------
 tcg/mips/tcg-target.c.inc | 21 +++++-----
 tcg/ppc/tcg-target.c.inc | 52 ++++++++++++------------
 tcg/riscv/tcg-target.c.inc | 20 +++++-----
 tcg/s390x/tcg-target.c.inc | 31 ++++++++-------
 tcg/sparc64/tcg-target.c.inc | 68 +++++++++++++++++---------------
 tcg/tci/tcg-target.c.inc | 16 ++++----
 11 files changed, 199 insertions(+), 173 deletions(-)
diff --git a/tcg/tcg.c b/tcg/tcg.c
22
index XXXXXXX..XXXXXXX 100644
23
--- a/tcg/tcg.c
24
+++ b/tcg/tcg.c
25
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg);
26
static void tcg_out_movi(TCGContext *s, TCGType type,
27
TCGReg ret, tcg_target_long arg);
28
static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg);
29
+static void tcg_out_goto_tb(TCGContext *s, int which);
30
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
31
const TCGArg args[TCG_MAX_OP_ARGS],
32
const int const_args[TCG_MAX_OP_ARGS]);
33
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, target_ulong pc_start)
34
case INDEX_op_exit_tb:
35
tcg_out_exit_tb(s, op->args[0]);
36
break;
37
+ case INDEX_op_goto_tb:
38
+ tcg_out_goto_tb(s, op->args[0]);
39
+ break;
40
case INDEX_op_dup2_vec:
41
if (tcg_reg_alloc_dup2(s, op)) {
42
break;
43
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
44
index XXXXXXX..XXXXXXX 100644
45
--- a/tcg/aarch64/tcg-target.c.inc
46
+++ b/tcg/aarch64/tcg-target.c.inc
47
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
48
}
49
}
50
51
+static void tcg_out_goto_tb(TCGContext *s, int which)
52
+{
53
+ /*
54
+ * Ensure that ADRP+ADD are 8-byte aligned so that an atomic
55
+ * write can be used to patch the target address.
56
+ */
57
+ if ((uintptr_t)s->code_ptr & 7) {
58
+ tcg_out32(s, NOP);
59
+ }
60
+ set_jmp_insn_offset(s, which);
61
+ /*
62
+ * actual branch destination will be patched by
63
+ * tb_target_set_jmp_target later
64
+ */
65
+ tcg_out_insn(s, 3406, ADRP, TCG_REG_TMP, 0);
66
+ tcg_out_insn(s, 3401, ADDI, TCG_TYPE_I64, TCG_REG_TMP, TCG_REG_TMP, 0);
67
+ tcg_out_insn(s, 3207, BR, TCG_REG_TMP);
68
+ set_jmp_reset_offset(s, which);
69
+}
70
+
71
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
72
const TCGArg args[TCG_MAX_OP_ARGS],
73
const int const_args[TCG_MAX_OP_ARGS])
74
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
75
#define REG0(I) (const_args[I] ? TCG_REG_XZR : (TCGReg)args[I])
76
77
switch (opc) {
78
- case INDEX_op_goto_tb:
79
- /*
80
- * Ensure that ADRP+ADD are 8-byte aligned so that an atomic
81
- * write can be used to patch the target address.
82
- */
83
- if ((uintptr_t)s->code_ptr & 7) {
84
- tcg_out32(s, NOP);
85
- }
86
- set_jmp_insn_offset(s, a0);
87
- /*
88
- * actual branch destination will be patched by
89
- * tb_target_set_jmp_target later
90
- */
91
- tcg_out_insn(s, 3406, ADRP, TCG_REG_TMP, 0);
92
- tcg_out_insn(s, 3401, ADDI, TCG_TYPE_I64, TCG_REG_TMP, TCG_REG_TMP, 0);
93
- tcg_out_insn(s, 3207, BR, TCG_REG_TMP);
94
- set_jmp_reset_offset(s, a0);
95
- break;
96
-
97
case INDEX_op_goto_ptr:
98
tcg_out_insn(s, 3207, BR, a0);
99
break;
100
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
101
case INDEX_op_mov_i64:
102
case INDEX_op_call: /* Always emitted via tcg_out_call. */
103
case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
104
+ case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */
105
default:
106
g_assert_not_reached();
107
}
108
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
109
index XXXXXXX..XXXXXXX 100644
110
--- a/tcg/arm/tcg-target.c.inc
111
+++ b/tcg/arm/tcg-target.c.inc
112
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg)
113
tcg_out_epilogue(s);
114
}
115
116
+static void tcg_out_goto_tb(TCGContext *s, int which)
117
+{
118
+ /* Indirect jump method */
119
+ intptr_t ptr, dif, dil;
120
+ TCGReg base = TCG_REG_PC;
121
+
122
+ qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
123
+ ptr = get_jmp_target_addr(s, which);
124
+ dif = tcg_pcrel_diff(s, (void *)ptr) - 8;
125
+ dil = sextract32(dif, 0, 12);
126
+ if (dif != dil) {
127
+ /*
128
+ * The TB is close, but outside the 12 bits addressable by
129
+ * the load. We can extend this to 20 bits with a sub of a
130
+ * shifted immediate from pc. In the vastly unlikely event
131
+ * the code requires more than 1MB, we'll use 2 insns and
132
+ * be no worse off.
133
+ */
134
+ base = TCG_REG_R0;
135
+ tcg_out_movi32(s, COND_AL, base, ptr - dil);
136
+ }
137
+ tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, base, dil);
138
+ set_jmp_reset_offset(s, which);
139
+}
140
+
141
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
142
const TCGArg args[TCG_MAX_OP_ARGS],
143
const int const_args[TCG_MAX_OP_ARGS])
144
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
145
int c;
146
147
switch (opc) {
148
- case INDEX_op_goto_tb:
149
- {
150
- /* Indirect jump method */
151
- intptr_t ptr, dif, dil;
152
- TCGReg base = TCG_REG_PC;
153
-
154
- qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
155
- ptr = get_jmp_target_addr(s, args[0]);
156
- dif = tcg_pcrel_diff(s, (void *)ptr) - 8;
157
- dil = sextract32(dif, 0, 12);
158
- if (dif != dil) {
159
- /* The TB is close, but outside the 12 bits addressable by
160
- the load. We can extend this to 20 bits with a sub of a
161
- shifted immediate from pc. In the vastly unlikely event
162
- the code requires more than 1MB, we'll use 2 insns and
163
- be no worse off. */
164
- base = TCG_REG_R0;
165
- tcg_out_movi32(s, COND_AL, base, ptr - dil);
166
- }
167
- tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, base, dil);
168
- set_jmp_reset_offset(s, args[0]);
169
- }
170
- break;
171
case INDEX_op_goto_ptr:
172
tcg_out_b_reg(s, COND_AL, args[0]);
173
break;
174
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
175
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
176
case INDEX_op_call: /* Always emitted via tcg_out_call. */
177
case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
178
+ case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */
179
default:
180
tcg_abort();
181
}
182
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
183
index XXXXXXX..XXXXXXX 100644
184
--- a/tcg/i386/tcg-target.c.inc
185
+++ b/tcg/i386/tcg-target.c.inc
186
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
187
}
188
}
189
190
+static void tcg_out_goto_tb(TCGContext *s, int which)
191
+{
192
+ /*
193
+ * Jump displacement must be aligned for atomic patching;
194
+ * see if we need to add extra nops before jump
195
+ */
196
+ int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
197
+ if (gap != 1) {
198
+ tcg_out_nopn(s, gap - 1);
199
+ }
200
+ tcg_out8(s, OPC_JMP_long); /* jmp im */
201
+ set_jmp_insn_offset(s, which);
202
+ tcg_out32(s, 0);
203
+ set_jmp_reset_offset(s, which);
204
+}
205
+
206
static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
207
const TCGArg args[TCG_MAX_OP_ARGS],
208
const int const_args[TCG_MAX_OP_ARGS])
209
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
210
const_a2 = const_args[2];
211
212
switch (opc) {
213
- case INDEX_op_goto_tb:
214
- {
215
- /*
216
- * Jump displacement must be aligned for atomic patching;
217
- * see if we need to add extra nops before jump
218
- */
219
- int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
220
- if (gap != 1) {
221
- tcg_out_nopn(s, gap - 1);
222
- }
223
- tcg_out8(s, OPC_JMP_long); /* jmp im */
224
- set_jmp_insn_offset(s, a0);
225
- tcg_out32(s, 0);
226
- }
227
- set_jmp_reset_offset(s, a0);
228
- break;
229
case INDEX_op_goto_ptr:
230
/* jmp to the given host address (could be epilogue) */
231
tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
232
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
233
case INDEX_op_mov_i64:
234
case INDEX_op_call: /* Always emitted via tcg_out_call. */
235
case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
236
+ case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */
237
default:
238
tcg_abort();
239
}
240
diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
241
index XXXXXXX..XXXXXXX 100644
242
--- a/tcg/loongarch64/tcg-target.c.inc
243
+++ b/tcg/loongarch64/tcg-target.c.inc
244
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
245
}
246
}
247
248
+static void tcg_out_goto_tb(TCGContext *s, int which)
249
+{
250
+ /*
251
+ * Ensure that patch area is 8-byte aligned so that an
252
+ * atomic write can be used to patch the target address.
253
+ */
254
+ if ((uintptr_t)s->code_ptr & 7) {
255
+ tcg_out_nop(s);
256
+ }
257
+ set_jmp_insn_offset(s, which);
258
+ /*
259
+ * actual branch destination will be patched by
260
+ * tb_target_set_jmp_target later
261
+ */
262
+ tcg_out_opc_pcaddu18i(s, TCG_REG_TMP0, 0);
263
+ tcg_out_opc_jirl(s, TCG_REG_ZERO, TCG_REG_TMP0, 0);
264
+ set_jmp_reset_offset(s, which);
265
+}
266
+
267
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
268
const TCGArg args[TCG_MAX_OP_ARGS],
269
const int const_args[TCG_MAX_OP_ARGS])
270
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
271
int c2 = const_args[2];
272
273
switch (opc) {
274
- case INDEX_op_goto_tb:
275
- /*
276
- * Ensure that patch area is 8-byte aligned so that an
277
- * atomic write can be used to patch the target address.
278
- */
279
- if ((uintptr_t)s->code_ptr & 7) {
280
- tcg_out_nop(s);
281
- }
282
- set_jmp_insn_offset(s, a0);
283
- /*
284
- * actual branch destination will be patched by
285
- * tb_target_set_jmp_target later
286
- */
287
- tcg_out_opc_pcaddu18i(s, TCG_REG_TMP0, 0);
288
- tcg_out_opc_jirl(s, TCG_REG_ZERO, TCG_REG_TMP0, 0);
289
- set_jmp_reset_offset(s, a0);
290
- break;
291
-
292
case INDEX_op_mb:
293
tcg_out_mb(s, a0);
294
break;
295
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
296
case INDEX_op_mov_i64:
297
case INDEX_op_call: /* Always emitted via tcg_out_call. */
298
case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
299
+ case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */
300
default:
301
g_assert_not_reached();
302
}
303
diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
304
index XXXXXXX..XXXXXXX 100644
305
--- a/tcg/mips/tcg-target.c.inc
306
+++ b/tcg/mips/tcg-target.c.inc
307
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
308
tcg_out_opc_imm(s, OPC_ORI, TCG_REG_V0, b0, a0 & 0xffff);
309
}
310
311
+static void tcg_out_goto_tb(TCGContext *s, int which)
312
+{
313
+ /* indirect jump method */
314
+ qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
315
+ tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP0, TCG_REG_ZERO,
316
+ get_jmp_target_addr(s, which));
317
+ tcg_out_opc_reg(s, OPC_JR, 0, TCG_TMP0, 0);
318
+ tcg_out_nop(s);
319
+ set_jmp_reset_offset(s, which);
320
+}
321
+
322
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
323
const TCGArg args[TCG_MAX_OP_ARGS],
324
const int const_args[TCG_MAX_OP_ARGS])
325
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
326
c2 = const_args[2];
327
328
switch (opc) {
329
- case INDEX_op_goto_tb:
330
- /* indirect jump method */
331
- qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
332
- tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP0, TCG_REG_ZERO,
333
- get_jmp_target_addr(s, a0));
334
- tcg_out_opc_reg(s, OPC_JR, 0, TCG_TMP0, 0);
335
- tcg_out_nop(s);
336
- set_jmp_reset_offset(s, a0);
337
- break;
338
case INDEX_op_goto_ptr:
339
/* jmp to the given host address (could be epilogue) */
340
tcg_out_opc_reg(s, OPC_JR, 0, a0, 0);
341
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
342
case INDEX_op_mov_i64:
343
case INDEX_op_call: /* Always emitted via tcg_out_call. */
344
case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
345
+ case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */
346
default:
347
tcg_abort();
348
}
349
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
350
index XXXXXXX..XXXXXXX 100644
351
--- a/tcg/ppc/tcg-target.c.inc
352
+++ b/tcg/ppc/tcg-target.c.inc
353
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg)
354
tcg_out_b(s, 0, tcg_code_gen_epilogue);
355
}
356
357
+static void tcg_out_goto_tb(TCGContext *s, int which)
358
+{
359
+ /* Direct jump. */
360
+ if (TCG_TARGET_REG_BITS == 64) {
361
+ /* Ensure the next insns are 8 or 16-byte aligned. */
362
+ while ((uintptr_t)s->code_ptr & (have_isa_2_07 ? 15 : 7)) {
363
+ tcg_out32(s, NOP);
364
+ }
365
+ set_jmp_insn_offset(s, which);
366
+ tcg_out32(s, ADDIS | TAI(TCG_REG_TB, TCG_REG_TB, 0));
367
+ tcg_out32(s, ADDI | TAI(TCG_REG_TB, TCG_REG_TB, 0));
368
+ tcg_out32(s, MTSPR | RS(TCG_REG_TB) | CTR);
369
+ tcg_out32(s, BCCTR | BO_ALWAYS);
370
+ set_jmp_reset_offset(s, which);
371
+ if (USE_REG_TB) {
372
+ /* For the unlinked case, need to reset TCG_REG_TB. */
373
+ tcg_out_mem_long(s, ADDI, ADD, TCG_REG_TB, TCG_REG_TB,
374
+ -tcg_current_code_size(s));
375
+ }
376
+ } else {
377
+ set_jmp_insn_offset(s, which);
378
+ tcg_out32(s, B);
379
+ set_jmp_reset_offset(s, which);
380
+ }
381
+}
382
+
383
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
384
const TCGArg args[TCG_MAX_OP_ARGS],
385
const int const_args[TCG_MAX_OP_ARGS])
386
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
387
TCGArg a0, a1, a2;
388
389
switch (opc) {
390
- case INDEX_op_goto_tb:
391
- /* Direct jump. */
392
- if (TCG_TARGET_REG_BITS == 64) {
393
- /* Ensure the next insns are 8 or 16-byte aligned. */
394
- while ((uintptr_t)s->code_ptr & (have_isa_2_07 ? 15 : 7)) {
395
- tcg_out32(s, NOP);
396
- }
397
- set_jmp_insn_offset(s, args[0]);
398
- tcg_out32(s, ADDIS | TAI(TCG_REG_TB, TCG_REG_TB, 0));
399
- tcg_out32(s, ADDI | TAI(TCG_REG_TB, TCG_REG_TB, 0));
400
- } else {
401
- set_jmp_insn_offset(s, args[0]);
402
- tcg_out32(s, B);
403
- set_jmp_reset_offset(s, args[0]);
404
- break;
405
- }
406
- tcg_out32(s, MTSPR | RS(TCG_REG_TB) | CTR);
407
- tcg_out32(s, BCCTR | BO_ALWAYS);
408
- set_jmp_reset_offset(s, args[0]);
409
- if (USE_REG_TB) {
410
- /* For the unlinked case, need to reset TCG_REG_TB. */
411
- tcg_out_mem_long(s, ADDI, ADD, TCG_REG_TB, TCG_REG_TB,
412
- -tcg_current_code_size(s));
413
- }
414
- break;
415
case INDEX_op_goto_ptr:
416
tcg_out32(s, MTSPR | RS(args[0]) | CTR);
417
if (USE_REG_TB) {
418
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
419
case INDEX_op_mov_i64:
420
case INDEX_op_call: /* Always emitted via tcg_out_call. */
421
case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
422
+ case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */
423
default:
424
tcg_abort();
425
}
426
diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
427
index XXXXXXX..XXXXXXX 100644
428
--- a/tcg/riscv/tcg-target.c.inc
429
+++ b/tcg/riscv/tcg-target.c.inc
430
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
431
}
432
}
433
434
+static void tcg_out_goto_tb(TCGContext *s, int which)
435
+{
436
+ qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
437
+ /* indirect jump method */
438
+ tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, TCG_REG_ZERO,
439
+ get_jmp_target_addr(s, which));
440
+ tcg_out_opc_imm(s, OPC_JALR, TCG_REG_ZERO, TCG_REG_TMP0, 0);
441
+ set_jmp_reset_offset(s, which);
442
+}
443
+
444
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
445
const TCGArg args[TCG_MAX_OP_ARGS],
446
const int const_args[TCG_MAX_OP_ARGS])
447
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
448
int c2 = const_args[2];
449
450
switch (opc) {
451
- case INDEX_op_goto_tb:
452
- qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
453
- /* indirect jump method */
454
- tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, TCG_REG_ZERO,
455
- get_jmp_target_addr(s, a0));
456
- tcg_out_opc_imm(s, OPC_JALR, TCG_REG_ZERO, TCG_REG_TMP0, 0);
457
- set_jmp_reset_offset(s, a0);
458
- break;
459
-
460
case INDEX_op_goto_ptr:
461
tcg_out_opc_imm(s, OPC_JALR, TCG_REG_ZERO, a0, 0);
462
break;
463
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
464
case INDEX_op_mov_i64:
465
case INDEX_op_call: /* Always emitted via tcg_out_call. */
466
case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
467
+ case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */
468
default:
469
g_assert_not_reached();
470
}
471
diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
472
index XXXXXXX..XXXXXXX 100644
473
--- a/tcg/s390x/tcg-target.c.inc
474
+++ b/tcg/s390x/tcg-target.c.inc
475
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
476
}
477
}
478
479
+static void tcg_out_goto_tb(TCGContext *s, int which)
480
+{
481
+ /*
482
+ * Branch displacement must be aligned for atomic patching;
483
+ * see if we need to add extra nop before branch
484
+ */
485
+ if (!QEMU_PTR_IS_ALIGNED(s->code_ptr + 1, 4)) {
486
+ tcg_out16(s, NOP);
487
+ }
488
+ tcg_out16(s, RIL_BRCL | (S390_CC_ALWAYS << 4));
489
+ set_jmp_insn_offset(s, which);
490
+ s->code_ptr += 2;
491
+ set_jmp_reset_offset(s, which);
492
+}
493
+
494
# define OP_32_64(x) \
495
case glue(glue(INDEX_op_,x),_i32): \
496
case glue(glue(INDEX_op_,x),_i64)
497
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
498
TCGArg a0, a1, a2;
499
500
switch (opc) {
501
- case INDEX_op_goto_tb:
502
- a0 = args[0];
503
- /*
504
- * branch displacement must be aligned for atomic patching;
505
- * see if we need to add extra nop before branch
506
- */
507
- if (!QEMU_PTR_IS_ALIGNED(s->code_ptr + 1, 4)) {
508
- tcg_out16(s, NOP);
509
- }
510
- tcg_out16(s, RIL_BRCL | (S390_CC_ALWAYS << 4));
511
- set_jmp_insn_offset(s, a0);
512
- s->code_ptr += 2;
513
- set_jmp_reset_offset(s, a0);
514
- break;
515
-
516
case INDEX_op_goto_ptr:
517
a0 = args[0];
518
tcg_out_insn(s, RR, BCR, S390_CC_ALWAYS, a0);
519
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
520
case INDEX_op_mov_i64:
521
case INDEX_op_call: /* Always emitted via tcg_out_call. */
522
case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
523
+ case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */
524
default:
525
tcg_abort();
526
}
527
diff --git a/tcg/sparc64/tcg-target.c.inc b/tcg/sparc64/tcg-target.c.inc
528
index XXXXXXX..XXXXXXX 100644
529
--- a/tcg/sparc64/tcg-target.c.inc
530
+++ b/tcg/sparc64/tcg-target.c.inc
531
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
532
tcg_out_arithi(s, TCG_REG_O0, TCG_REG_O0, a0 & 0x3ff, ARITH_OR);
533
}
534
535
+static void tcg_out_goto_tb(TCGContext *s, int which)
536
+{
537
+ /* Direct jump. */
538
+ if (USE_REG_TB) {
539
+ /* make sure the patch is 8-byte aligned. */
540
+ if ((intptr_t)s->code_ptr & 4) {
541
+ tcg_out_nop(s);
542
+ }
543
+ set_jmp_insn_offset(s, which);
544
+ tcg_out_sethi(s, TCG_REG_T1, 0);
545
+ tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, 0, ARITH_OR);
546
+ tcg_out_arith(s, TCG_REG_G0, TCG_REG_TB, TCG_REG_T1, JMPL);
547
+ tcg_out_arith(s, TCG_REG_TB, TCG_REG_TB, TCG_REG_T1, ARITH_ADD);
548
+ } else {
549
+ set_jmp_insn_offset(s, which);
550
+ tcg_out32(s, CALL);
551
+ tcg_out_nop(s);
552
+ }
553
+ set_jmp_reset_offset(s, which);
554
+
555
+ /*
556
+ * For the unlinked path of goto_tb, we need to reset TCG_REG_TB
557
+ * to the beginning of this TB.
558
+ */
559
+ if (USE_REG_TB) {
560
+ int c = -tcg_current_code_size(s);
561
+ if (check_fit_i32(c, 13)) {
562
+ tcg_out_arithi(s, TCG_REG_TB, TCG_REG_TB, c, ARITH_ADD);
563
+ } else {
564
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_T1, c);
565
+ tcg_out_arith(s, TCG_REG_TB, TCG_REG_TB, TCG_REG_T1, ARITH_ADD);
566
+ }
567
+ }
568
+}
569
+
570
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
571
const TCGArg args[TCG_MAX_OP_ARGS],
572
const int const_args[TCG_MAX_OP_ARGS])
573
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
574
c2 = const_args[2];
575
576
switch (opc) {
577
- case INDEX_op_goto_tb:
578
- /* Direct jump. */
579
- if (USE_REG_TB) {
580
- /* make sure the patch is 8-byte aligned. */
581
- if ((intptr_t)s->code_ptr & 4) {
582
- tcg_out_nop(s);
583
- }
584
- set_jmp_insn_offset(s, a0);
585
- tcg_out_sethi(s, TCG_REG_T1, 0);
586
- tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, 0, ARITH_OR);
587
- tcg_out_arith(s, TCG_REG_G0, TCG_REG_TB, TCG_REG_T1, JMPL);
588
- tcg_out_arith(s, TCG_REG_TB, TCG_REG_TB, TCG_REG_T1, ARITH_ADD);
589
- } else {
590
- set_jmp_insn_offset(s, a0);
591
- tcg_out32(s, CALL);
592
- tcg_out_nop(s);
593
- }
594
- set_jmp_reset_offset(s, a0);
595
-
596
- /* For the unlinked path of goto_tb, we need to reset
597
- TCG_REG_TB to the beginning of this TB. */
598
- if (USE_REG_TB) {
599
- c = -tcg_current_code_size(s);
600
- if (check_fit_i32(c, 13)) {
601
- tcg_out_arithi(s, TCG_REG_TB, TCG_REG_TB, c, ARITH_ADD);
602
- } else {
603
- tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_T1, c);
604
- tcg_out_arith(s, TCG_REG_TB, TCG_REG_TB,
605
- TCG_REG_T1, ARITH_ADD);
606
- }
607
- }
608
- break;
609
case INDEX_op_goto_ptr:
610
tcg_out_arithi(s, TCG_REG_G0, a0, 0, JMPL);
611
if (USE_REG_TB) {
612
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
613
case INDEX_op_mov_i64:
614
case INDEX_op_call: /* Always emitted via tcg_out_call. */
615
case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
616
+ case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */
617
default:
618
tcg_abort();
619
}
620
diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
621
index XXXXXXX..XXXXXXX 100644
622
--- a/tcg/tci/tcg-target.c.inc
623
+++ b/tcg/tci/tcg-target.c.inc
624
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg)
625
tcg_out_op_p(s, INDEX_op_exit_tb, (void *)arg);
626
}
627
628
+static void tcg_out_goto_tb(TCGContext *s, int which)
629
+{
630
+ qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
631
+ /* indirect jump method. */
632
+ tcg_out_op_p(s, INDEX_op_goto_tb, (void *)get_jmp_target_addr(s, which));
633
+ set_jmp_reset_offset(s, which);
634
+}
635
+
636
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
637
const TCGArg args[TCG_MAX_OP_ARGS],
638
const int const_args[TCG_MAX_OP_ARGS])
639
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
640
TCGOpcode exts;
641
642
switch (opc) {
643
- case INDEX_op_goto_tb:
644
- qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
645
- /* indirect jump method. */
646
- tcg_out_op_p(s, opc, (void *)get_jmp_target_addr(s, args[0]));
647
- set_jmp_reset_offset(s, args[0]);
648
- break;
649
-
650
case INDEX_op_goto_ptr:
651
tcg_out_op_r(s, opc, args[0]);
652
break;
653
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
654
case INDEX_op_mov_i64:
655
case INDEX_op_call: /* Always emitted via tcg_out_call. */
656
case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
657
+ case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */
658
default:
659
tcg_abort();
660
}
661
--
662
2.34.1
663
664
New patch
1
This will shortly be used for more than reset.
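As a rough, illustrative sketch (simplified declarations, not QEMU's actual headers), the renamed sentinel can then guard both of the per-TB jump-offset arrays that appear later in this series:

#include <stdbool.h>
#include <stdint.h>

#define TB_JMP_OFFSET_INVALID 0xffff    /* "no jump generated" marker */

/* Hypothetical stand-in for the two per-TB goto_tb slots. */
struct tb_jumps_sketch {
    uint16_t jmp_reset_offset[2];   /* offset of the unlinked jump target */
    uint16_t jmp_insn_offset[2];    /* offset of the patchable branch insn */
};

static bool jump_generated_sketch(const struct tb_jumps_sketch *j, int n)
{
    return j->jmp_reset_offset[n] != TB_JMP_OFFSET_INVALID;
}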
1
2
3
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
4
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
7
include/exec/exec-all.h | 2 +-
8
accel/tcg/translate-all.c | 8 ++++----
9
tcg/tcg.c | 4 ++--
10
3 files changed, 7 insertions(+), 7 deletions(-)
11
12
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
13
index XXXXXXX..XXXXXXX 100644
14
--- a/include/exec/exec-all.h
15
+++ b/include/exec/exec-all.h
16
@@ -XXX,XX +XXX,XX @@ struct TranslationBlock {
17
* setting one of the jump targets (or patching the jump instruction). Only
18
* two of such jumps are supported.
19
*/
20
+#define TB_JMP_OFFSET_INVALID 0xffff /* indicates no jump generated */
21
uint16_t jmp_reset_offset[2]; /* offset of original jump target */
22
-#define TB_JMP_RESET_OFFSET_INVALID 0xffff /* indicates no jump generated */
23
uintptr_t jmp_target_arg[2]; /* target address or offset */
24
25
/*
26
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
27
index XXXXXXX..XXXXXXX 100644
28
--- a/accel/tcg/translate-all.c
29
+++ b/accel/tcg/translate-all.c
30
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
31
tb->jmp_dest[1] = (uintptr_t)NULL;
32
33
/* init original jump addresses which have been set during tcg_gen_code() */
34
- if (tb->jmp_reset_offset[0] != TB_JMP_RESET_OFFSET_INVALID) {
35
+ if (tb->jmp_reset_offset[0] != TB_JMP_OFFSET_INVALID) {
36
tb_reset_jump(tb, 0);
37
}
38
- if (tb->jmp_reset_offset[1] != TB_JMP_RESET_OFFSET_INVALID) {
39
+ if (tb->jmp_reset_offset[1] != TB_JMP_OFFSET_INVALID) {
40
tb_reset_jump(tb, 1);
41
}
42
43
@@ -XXX,XX +XXX,XX @@ static gboolean tb_tree_stats_iter(gpointer key, gpointer value, gpointer data)
44
if (tb_page_addr1(tb) != -1) {
45
tst->cross_page++;
46
}
47
- if (tb->jmp_reset_offset[0] != TB_JMP_RESET_OFFSET_INVALID) {
48
+ if (tb->jmp_reset_offset[0] != TB_JMP_OFFSET_INVALID) {
49
tst->direct_jmp_count++;
50
- if (tb->jmp_reset_offset[1] != TB_JMP_RESET_OFFSET_INVALID) {
51
+ if (tb->jmp_reset_offset[1] != TB_JMP_OFFSET_INVALID) {
52
tst->direct_jmp2_count++;
53
}
54
}
55
diff --git a/tcg/tcg.c b/tcg/tcg.c
56
index XXXXXXX..XXXXXXX 100644
57
--- a/tcg/tcg.c
58
+++ b/tcg/tcg.c
59
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, target_ulong pc_start)
60
#endif
61
62
/* Initialize goto_tb jump offsets. */
63
- tb->jmp_reset_offset[0] = TB_JMP_RESET_OFFSET_INVALID;
64
- tb->jmp_reset_offset[1] = TB_JMP_RESET_OFFSET_INVALID;
65
+ tb->jmp_reset_offset[0] = TB_JMP_OFFSET_INVALID;
66
+ tb->jmp_reset_offset[1] = TB_JMP_OFFSET_INVALID;
67
tcg_ctx->tb_jmp_reset_offset = tb->jmp_reset_offset;
68
if (TCG_TARGET_HAS_direct_jump) {
69
tcg_ctx->tb_jmp_insn_offset = tb->jmp_target_arg;
70
--
71
2.34.1
72
73
diff view generated by jsdifflib
New patch
1
This can replace four other variables that are references
2
into the TranslationBlock structure.
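A minimal sketch of the idea, using hypothetical simplified types rather than QEMU's real TCGContext and TranslationBlock: a single back-pointer to the TB under generation replaces the separately cached cflags and jump bookkeeping fields.

#include <stddef.h>
#include <stdint.h>

/* Hypothetical, stripped-down stand-ins for the real structures. */
typedef struct TBSketch {
    uint32_t cflags;
    uint16_t jmp_reset_offset[2];
    uintptr_t jmp_target_arg[2];
} TBSketch;

typedef struct CtxSketch {
    TBSketch *gen_tb;       /* tb for which code is being generated */
    uint8_t *code_buf;      /* start of the generated code buffer */
    uint8_t *code_ptr;      /* running end of the generated code */
} CtxSketch;

/* Stores that previously went through cached pointers now go via gen_tb. */
static void set_jmp_reset_offset_sketch(CtxSketch *s, int which)
{
    s->gen_tb->jmp_reset_offset[which] =
        (uint16_t)(s->code_ptr - s->code_buf);
}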
1
3
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
7
include/tcg/tcg.h | 11 +++--------
8
accel/tcg/translate-all.c | 2 +-
9
tcg/tcg-op.c | 14 +++++++-------
10
tcg/tcg.c | 14 +++-----------
11
4 files changed, 14 insertions(+), 27 deletions(-)
12
13
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
14
index XXXXXXX..XXXXXXX 100644
15
--- a/include/tcg/tcg.h
16
+++ b/include/tcg/tcg.h
17
@@ -XXX,XX +XXX,XX @@ struct TCGContext {
18
int nb_indirects;
19
int nb_ops;
20
21
- /* goto_tb support */
22
- tcg_insn_unit *code_buf;
23
- uint16_t *tb_jmp_reset_offset; /* tb->jmp_reset_offset */
24
- uintptr_t *tb_jmp_insn_offset; /* tb->jmp_target_arg if direct_jump */
25
- uintptr_t *tb_jmp_target_addr; /* tb->jmp_target_arg if !direct_jump */
26
-
27
TCGRegSet reserved_regs;
28
- uint32_t tb_cflags; /* cflags of the current TB */
29
intptr_t current_frame_offset;
30
intptr_t frame_start;
31
intptr_t frame_end;
32
TCGTemp *frame_temp;
33
34
- tcg_insn_unit *code_ptr;
35
+ TranslationBlock *gen_tb; /* tb for which code is being generated */
36
+ tcg_insn_unit *code_buf; /* pointer for start of tb */
37
+ tcg_insn_unit *code_ptr; /* pointer for running end of tb */
38
39
#ifdef CONFIG_PROFILER
40
TCGProfile prof;
41
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
42
index XXXXXXX..XXXXXXX 100644
43
--- a/accel/tcg/translate-all.c
44
+++ b/accel/tcg/translate-all.c
45
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
46
tb->trace_vcpu_dstate = *cpu->trace_dstate;
47
tb_set_page_addr0(tb, phys_pc);
48
tb_set_page_addr1(tb, -1);
49
- tcg_ctx->tb_cflags = cflags;
50
+ tcg_ctx->gen_tb = tb;
51
tb_overflow:
52
53
#ifdef CONFIG_PROFILER
54
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
55
index XXXXXXX..XXXXXXX 100644
56
--- a/tcg/tcg-op.c
57
+++ b/tcg/tcg-op.c
58
@@ -XXX,XX +XXX,XX @@ void tcg_gen_op6(TCGOpcode opc, TCGArg a1, TCGArg a2, TCGArg a3,
59
60
void tcg_gen_mb(TCGBar mb_type)
61
{
62
- if (tcg_ctx->tb_cflags & CF_PARALLEL) {
63
+ if (tcg_ctx->gen_tb->cflags & CF_PARALLEL) {
64
tcg_gen_op1(INDEX_op_mb, mb_type);
65
}
66
}
67
@@ -XXX,XX +XXX,XX @@ void tcg_gen_exit_tb(const TranslationBlock *tb, unsigned idx)
68
void tcg_gen_goto_tb(unsigned idx)
69
{
70
/* We tested CF_NO_GOTO_TB in translator_use_goto_tb. */
71
- tcg_debug_assert(!(tcg_ctx->tb_cflags & CF_NO_GOTO_TB));
72
+ tcg_debug_assert(!(tcg_ctx->gen_tb->cflags & CF_NO_GOTO_TB));
73
/* We only support two chained exits. */
74
tcg_debug_assert(idx <= TB_EXIT_IDXMAX);
75
#ifdef CONFIG_DEBUG_TCG
76
@@ -XXX,XX +XXX,XX @@ void tcg_gen_lookup_and_goto_ptr(void)
77
{
78
TCGv_ptr ptr;
79
80
- if (tcg_ctx->tb_cflags & CF_NO_GOTO_PTR) {
81
+ if (tcg_ctx->gen_tb->cflags & CF_NO_GOTO_PTR) {
82
tcg_gen_exit_tb(NULL, 0);
83
return;
84
}
85
@@ -XXX,XX +XXX,XX @@ void tcg_gen_atomic_cmpxchg_i32(TCGv_i32 retv, TCGv addr, TCGv_i32 cmpv,
86
{
87
memop = tcg_canonicalize_memop(memop, 0, 0);
88
89
- if (!(tcg_ctx->tb_cflags & CF_PARALLEL)) {
90
+ if (!(tcg_ctx->gen_tb->cflags & CF_PARALLEL)) {
91
TCGv_i32 t1 = tcg_temp_new_i32();
92
TCGv_i32 t2 = tcg_temp_new_i32();
93
94
@@ -XXX,XX +XXX,XX @@ void tcg_gen_atomic_cmpxchg_i64(TCGv_i64 retv, TCGv addr, TCGv_i64 cmpv,
95
{
96
memop = tcg_canonicalize_memop(memop, 1, 0);
97
98
- if (!(tcg_ctx->tb_cflags & CF_PARALLEL)) {
99
+ if (!(tcg_ctx->gen_tb->cflags & CF_PARALLEL)) {
100
TCGv_i64 t1 = tcg_temp_new_i64();
101
TCGv_i64 t2 = tcg_temp_new_i64();
102
103
@@ -XXX,XX +XXX,XX @@ static void * const table_##NAME[(MO_SIZE | MO_BSWAP) + 1] = { \
104
void tcg_gen_atomic_##NAME##_i32 \
105
(TCGv_i32 ret, TCGv addr, TCGv_i32 val, TCGArg idx, MemOp memop) \
106
{ \
107
- if (tcg_ctx->tb_cflags & CF_PARALLEL) { \
108
+ if (tcg_ctx->gen_tb->cflags & CF_PARALLEL) { \
109
do_atomic_op_i32(ret, addr, val, idx, memop, table_##NAME); \
110
} else { \
111
do_nonatomic_op_i32(ret, addr, val, idx, memop, NEW, \
112
@@ -XXX,XX +XXX,XX @@ void tcg_gen_atomic_##NAME##_i32 \
113
void tcg_gen_atomic_##NAME##_i64 \
114
(TCGv_i64 ret, TCGv addr, TCGv_i64 val, TCGArg idx, MemOp memop) \
115
{ \
116
- if (tcg_ctx->tb_cflags & CF_PARALLEL) { \
117
+ if (tcg_ctx->gen_tb->cflags & CF_PARALLEL) { \
118
do_atomic_op_i64(ret, addr, val, idx, memop, table_##NAME); \
119
} else { \
120
do_nonatomic_op_i64(ret, addr, val, idx, memop, NEW, \
121
diff --git a/tcg/tcg.c b/tcg/tcg.c
122
index XXXXXXX..XXXXXXX 100644
123
--- a/tcg/tcg.c
124
+++ b/tcg/tcg.c
125
@@ -XXX,XX +XXX,XX @@ static void set_jmp_reset_offset(TCGContext *s, int which)
126
* We will check for overflow at the end of the opcode loop in
127
* tcg_gen_code, where we bound tcg_current_code_size to UINT16_MAX.
128
*/
129
- s->tb_jmp_reset_offset[which] = tcg_current_code_size(s);
130
+ s->gen_tb->jmp_reset_offset[which] = tcg_current_code_size(s);
131
}
132
133
static void G_GNUC_UNUSED set_jmp_insn_offset(TCGContext *s, int which)
134
@@ -XXX,XX +XXX,XX @@ static void G_GNUC_UNUSED set_jmp_insn_offset(TCGContext *s, int which)
135
* tcg_gen_code, where we bound tcg_current_code_size to UINT16_MAX.
136
*/
137
tcg_debug_assert(TCG_TARGET_HAS_direct_jump);
138
- s->tb_jmp_insn_offset[which] = tcg_current_code_size(s);
139
+ s->gen_tb->jmp_target_arg[which] = tcg_current_code_size(s);
140
}
141
142
static uintptr_t G_GNUC_UNUSED get_jmp_target_addr(TCGContext *s, int which)
143
@@ -XXX,XX +XXX,XX @@ static uintptr_t G_GNUC_UNUSED get_jmp_target_addr(TCGContext *s, int which)
144
* Return the read-execute version of the pointer, for the benefit
145
* of any pc-relative addressing mode.
146
*/
147
- return (uintptr_t)tcg_splitwx_to_rx(&s->tb_jmp_target_addr[which]);
148
+ return (uintptr_t)tcg_splitwx_to_rx(s->gen_tb->jmp_target_arg + which);
149
}
150
151
/* Signal overflow, starting over with fewer guest insns. */
152
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, target_ulong pc_start)
153
/* Initialize goto_tb jump offsets. */
154
tb->jmp_reset_offset[0] = TB_JMP_OFFSET_INVALID;
155
tb->jmp_reset_offset[1] = TB_JMP_OFFSET_INVALID;
156
- tcg_ctx->tb_jmp_reset_offset = tb->jmp_reset_offset;
157
- if (TCG_TARGET_HAS_direct_jump) {
158
- tcg_ctx->tb_jmp_insn_offset = tb->jmp_target_arg;
159
- tcg_ctx->tb_jmp_target_addr = NULL;
160
- } else {
161
- tcg_ctx->tb_jmp_insn_offset = NULL;
162
- tcg_ctx->tb_jmp_target_addr = tb->jmp_target_arg;
163
- }
164
165
tcg_reg_alloc_start(s);
166
167
--
168
2.34.1
169
170

1
From: Claudio Fontana <cfontana@suse.de>
1
Stop overloading jmp_target_arg for both offset and address,
2
depending on TCG_TARGET_HAS_direct_jump. Instead, add a new
3
field to hold the jump insn offset and always set the target
4
address in jmp_target_addr[]. This will allow a tcg backend
5
to use either direct or indirect depending on displacement.
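A hedged sketch of the mechanism this unlocks (hypothetical helper names, not the series' actual backend code): the recorded insn offset locates the patch site, while jmp_target_addr[] always holds a usable indirect target, so a backend can pick direct or indirect per branch.

#include <stdbool.h>
#include <stdint.h>

typedef struct JumpSlotSketch {
    uint16_t insn_offset;     /* where the patchable branch insn lives */
    uintptr_t target_addr;    /* always valid for the indirect path */
} JumpSlotSketch;

static bool fits_signed_sketch(intptr_t disp, int bits)
{
    intptr_t limit = (intptr_t)1 << (bits - 1);
    return disp >= -limit && disp < limit;
}

/* Re-link one goto_tb slot: the indirect target is always updated; a
 * direct branch is patched in only when the displacement is in range. */
static void relink_jump_sketch(uint8_t *tb_code, JumpSlotSketch *slot,
                               uintptr_t dest, int branch_range_bits)
{
    uintptr_t insn = (uintptr_t)tb_code + slot->insn_offset;

    slot->target_addr = dest;
    if (fits_signed_sketch((intptr_t)(dest - insn), branch_range_bits)) {
        /* target-specific: rewrite the insn at 'insn' as a direct branch
         * to 'dest'; when it does not fit, execution still reaches 'dest'
         * through the load from target_addr. */
    }
}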
2
6
3
Signed-off-by: Claudio Fontana <cfontana@suse.de>
7
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
4
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
5
Message-Id: <20201015143217.29337-4-cfontana@suse.de>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
8
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
---
9
---
8
accel/tcg/tcg-cpus-icount.h | 6 +--
10
include/exec/exec-all.h | 3 ++-
9
accel/tcg/tcg-cpus-rr.h | 2 +-
11
accel/tcg/cpu-exec.c | 5 ++---
10
accel/tcg/tcg-cpus.h | 6 +--
12
tcg/tcg.c | 6 ++++--
11
accel/tcg/tcg-cpus-icount.c | 24 ++++++------
13
3 files changed, 8 insertions(+), 6 deletions(-)
12
accel/tcg/tcg-cpus-mttcg.c | 10 ++---
13
accel/tcg/tcg-cpus-rr.c | 74 ++++++++++++++++++-------------------
14
accel/tcg/tcg-cpus.c | 6 +--
15
7 files changed, 64 insertions(+), 64 deletions(-)
16
14
17
diff --git a/accel/tcg/tcg-cpus-icount.h b/accel/tcg/tcg-cpus-icount.h
15
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
18
index XXXXXXX..XXXXXXX 100644
16
index XXXXXXX..XXXXXXX 100644
19
--- a/accel/tcg/tcg-cpus-icount.h
17
--- a/include/exec/exec-all.h
20
+++ b/accel/tcg/tcg-cpus-icount.h
18
+++ b/include/exec/exec-all.h
21
@@ -XXX,XX +XXX,XX @@
19
@@ -XXX,XX +XXX,XX @@ struct TranslationBlock {
22
#ifndef TCG_CPUS_ICOUNT_H
20
*/
23
#define TCG_CPUS_ICOUNT_H
21
#define TB_JMP_OFFSET_INVALID 0xffff /* indicates no jump generated */
24
22
uint16_t jmp_reset_offset[2]; /* offset of original jump target */
25
-void handle_icount_deadline(void);
23
- uintptr_t jmp_target_arg[2]; /* target address or offset */
26
-void prepare_icount_for_run(CPUState *cpu);
24
+ uint16_t jmp_insn_offset[2]; /* offset of direct jump insn */
27
-void process_icount_data(CPUState *cpu);
25
+ uintptr_t jmp_target_addr[2]; /* target address */
28
+void icount_handle_deadline(void);
26
29
+void icount_prepare_for_run(CPUState *cpu);
27
/*
30
+void icount_process_data(CPUState *cpu);
28
* Each TB has a NULL-terminated list (jmp_list_head) of incoming jumps.
31
29
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
32
#endif /* TCG_CPUS_ICOUNT_H */
33
diff --git a/accel/tcg/tcg-cpus-rr.h b/accel/tcg/tcg-cpus-rr.h
34
index XXXXXXX..XXXXXXX 100644
30
index XXXXXXX..XXXXXXX 100644
35
--- a/accel/tcg/tcg-cpus-rr.h
31
--- a/accel/tcg/cpu-exec.c
36
+++ b/accel/tcg/tcg-cpus-rr.h
32
+++ b/accel/tcg/cpu-exec.c
37
@@ -XXX,XX +XXX,XX @@
33
@@ -XXX,XX +XXX,XX @@ void cpu_exec_step_atomic(CPUState *cpu)
38
#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
34
39
35
void tb_set_jmp_target(TranslationBlock *tb, int n, uintptr_t addr)
40
/* Kick all RR vCPUs. */
41
-void qemu_cpu_kick_rr_cpus(CPUState *unused);
42
+void rr_kick_vcpu_thread(CPUState *unused);
43
44
/* start the round robin vcpu thread */
45
void rr_start_vcpu_thread(CPUState *cpu);
46
diff --git a/accel/tcg/tcg-cpus.h b/accel/tcg/tcg-cpus.h
47
index XXXXXXX..XXXXXXX 100644
48
--- a/accel/tcg/tcg-cpus.h
49
+++ b/accel/tcg/tcg-cpus.h
50
@@ -XXX,XX +XXX,XX @@ extern const CpusAccel tcg_cpus_mttcg;
51
extern const CpusAccel tcg_cpus_icount;
52
extern const CpusAccel tcg_cpus_rr;
53
54
-void qemu_tcg_destroy_vcpu(CPUState *cpu);
55
-int tcg_cpu_exec(CPUState *cpu);
56
-void tcg_handle_interrupt(CPUState *cpu, int mask);
57
+void tcg_cpus_destroy(CPUState *cpu);
58
+int tcg_cpus_exec(CPUState *cpu);
59
+void tcg_cpus_handle_interrupt(CPUState *cpu, int mask);
60
61
#endif /* TCG_CPUS_H */
62
diff --git a/accel/tcg/tcg-cpus-icount.c b/accel/tcg/tcg-cpus-icount.c
63
index XXXXXXX..XXXXXXX 100644
64
--- a/accel/tcg/tcg-cpus-icount.c
65
+++ b/accel/tcg/tcg-cpus-icount.c
66
@@ -XXX,XX +XXX,XX @@
67
#include "tcg-cpus-icount.h"
68
#include "tcg-cpus-rr.h"
69
70
-static int64_t tcg_get_icount_limit(void)
71
+static int64_t icount_get_limit(void)
72
{
36
{
73
int64_t deadline;
37
+ tb->jmp_target_addr[n] = addr;
74
38
if (TCG_TARGET_HAS_direct_jump) {
75
@@ -XXX,XX +XXX,XX @@ static int64_t tcg_get_icount_limit(void)
39
- uintptr_t offset = tb->jmp_target_arg[n];
40
+ uintptr_t offset = tb->jmp_insn_offset[n];
41
uintptr_t tc_ptr = (uintptr_t)tb->tc.ptr;
42
uintptr_t jmp_rx = tc_ptr + offset;
43
uintptr_t jmp_rw = jmp_rx - tcg_splitwx_diff;
44
tb_target_set_jmp_target(tc_ptr, jmp_rx, jmp_rw, addr);
45
- } else {
46
- tb->jmp_target_arg[n] = addr;
76
}
47
}
77
}
48
}
78
49
79
-static void notify_aio_contexts(void)
50
diff --git a/tcg/tcg.c b/tcg/tcg.c
80
+static void icount_notify_aio_contexts(void)
51
index XXXXXXX..XXXXXXX 100644
81
{
52
--- a/tcg/tcg.c
82
/* Wake up other AioContexts. */
53
+++ b/tcg/tcg.c
83
qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
54
@@ -XXX,XX +XXX,XX @@ static void G_GNUC_UNUSED set_jmp_insn_offset(TCGContext *s, int which)
84
qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
55
* tcg_gen_code, where we bound tcg_current_code_size to UINT16_MAX.
56
*/
57
tcg_debug_assert(TCG_TARGET_HAS_direct_jump);
58
- s->gen_tb->jmp_target_arg[which] = tcg_current_code_size(s);
59
+ s->gen_tb->jmp_insn_offset[which] = tcg_current_code_size(s);
85
}
60
}
86
61
87
-void handle_icount_deadline(void)
62
static uintptr_t G_GNUC_UNUSED get_jmp_target_addr(TCGContext *s, int which)
88
+void icount_handle_deadline(void)
63
@@ -XXX,XX +XXX,XX @@ static uintptr_t G_GNUC_UNUSED get_jmp_target_addr(TCGContext *s, int which)
89
{
64
* Return the read-execute version of the pointer, for the benefit
90
assert(qemu_in_vcpu_thread());
65
* of any pc-relative addressing mode.
91
int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
66
*/
92
QEMU_TIMER_ATTR_ALL);
67
- return (uintptr_t)tcg_splitwx_to_rx(s->gen_tb->jmp_target_arg + which);
93
68
+ return (uintptr_t)tcg_splitwx_to_rx(&s->gen_tb->jmp_target_addr[which]);
94
if (deadline == 0) {
95
- notify_aio_contexts();
96
+ icount_notify_aio_contexts();
97
}
98
}
69
}
99
70
100
-void prepare_icount_for_run(CPUState *cpu)
71
/* Signal overflow, starting over with fewer guest insns. */
101
+void icount_prepare_for_run(CPUState *cpu)
72
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, target_ulong pc_start)
102
{
73
/* Initialize goto_tb jump offsets. */
103
int insns_left;
74
tb->jmp_reset_offset[0] = TB_JMP_OFFSET_INVALID;
104
75
tb->jmp_reset_offset[1] = TB_JMP_OFFSET_INVALID;
105
/*
76
+ tb->jmp_insn_offset[0] = TB_JMP_OFFSET_INVALID;
106
- * These should always be cleared by process_icount_data after
77
+ tb->jmp_insn_offset[1] = TB_JMP_OFFSET_INVALID;
107
+ * These should always be cleared by icount_process_data after
78
108
* each vCPU execution. However u16.high can be raised
79
tcg_reg_alloc_start(s);
109
- * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
110
+ * asynchronously by cpu_exit/cpu_interrupt/tcg_cpus_handle_interrupt
111
*/
112
g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
113
g_assert(cpu->icount_extra == 0);
114
115
- cpu->icount_budget = tcg_get_icount_limit();
116
+ cpu->icount_budget = icount_get_limit();
117
insns_left = MIN(0xffff, cpu->icount_budget);
118
cpu_neg(cpu)->icount_decr.u16.low = insns_left;
119
cpu->icount_extra = cpu->icount_budget - insns_left;
120
@@ -XXX,XX +XXX,XX @@ void prepare_icount_for_run(CPUState *cpu)
121
replay_mutex_lock();
122
123
if (cpu->icount_budget == 0 && replay_has_checkpoint()) {
124
- notify_aio_contexts();
125
+ icount_notify_aio_contexts();
126
}
127
}
128
129
-void process_icount_data(CPUState *cpu)
130
+void icount_process_data(CPUState *cpu)
131
{
132
/* Account for executed instructions */
133
icount_update(cpu);
134
@@ -XXX,XX +XXX,XX @@ static void icount_handle_interrupt(CPUState *cpu, int mask)
135
{
136
int old_mask = cpu->interrupt_request;
137
138
- tcg_handle_interrupt(cpu, mask);
139
+ tcg_cpus_handle_interrupt(cpu, mask);
140
if (qemu_cpu_is_self(cpu) &&
141
!cpu->can_do_io
142
&& (mask & ~old_mask) != 0) {
143
@@ -XXX,XX +XXX,XX @@ static void icount_handle_interrupt(CPUState *cpu, int mask)
144
145
const CpusAccel tcg_cpus_icount = {
146
.create_vcpu_thread = rr_start_vcpu_thread,
147
- .kick_vcpu_thread = qemu_cpu_kick_rr_cpus,
148
+ .kick_vcpu_thread = rr_kick_vcpu_thread,
149
150
.handle_interrupt = icount_handle_interrupt,
151
.get_virtual_clock = icount_get,
152
diff --git a/accel/tcg/tcg-cpus-mttcg.c b/accel/tcg/tcg-cpus-mttcg.c
153
index XXXXXXX..XXXXXXX 100644
154
--- a/accel/tcg/tcg-cpus-mttcg.c
155
+++ b/accel/tcg/tcg-cpus-mttcg.c
156
@@ -XXX,XX +XXX,XX @@
157
* current CPUState for a given thread.
158
*/
159
160
-static void *tcg_cpu_thread_fn(void *arg)
161
+static void *mttcg_cpu_thread_fn(void *arg)
162
{
163
CPUState *cpu = arg;
164
165
@@ -XXX,XX +XXX,XX @@ static void *tcg_cpu_thread_fn(void *arg)
166
if (cpu_can_run(cpu)) {
167
int r;
168
qemu_mutex_unlock_iothread();
169
- r = tcg_cpu_exec(cpu);
170
+ r = tcg_cpus_exec(cpu);
171
qemu_mutex_lock_iothread();
172
switch (r) {
173
case EXCP_DEBUG:
174
@@ -XXX,XX +XXX,XX @@ static void *tcg_cpu_thread_fn(void *arg)
175
qemu_wait_io_event(cpu);
176
} while (!cpu->unplug || cpu_can_run(cpu));
177
178
- qemu_tcg_destroy_vcpu(cpu);
179
+ tcg_cpus_destroy(cpu);
180
qemu_mutex_unlock_iothread();
181
rcu_unregister_thread();
182
return NULL;
183
@@ -XXX,XX +XXX,XX @@ static void mttcg_start_vcpu_thread(CPUState *cpu)
184
snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
185
cpu->cpu_index);
186
187
- qemu_thread_create(cpu->thread, thread_name, tcg_cpu_thread_fn,
188
+ qemu_thread_create(cpu->thread, thread_name, mttcg_cpu_thread_fn,
189
cpu, QEMU_THREAD_JOINABLE);
190
191
#ifdef _WIN32
192
@@ -XXX,XX +XXX,XX @@ const CpusAccel tcg_cpus_mttcg = {
193
.create_vcpu_thread = mttcg_start_vcpu_thread,
194
.kick_vcpu_thread = mttcg_kick_vcpu_thread,
195
196
- .handle_interrupt = tcg_handle_interrupt,
197
+ .handle_interrupt = tcg_cpus_handle_interrupt,
198
};
199
diff --git a/accel/tcg/tcg-cpus-rr.c b/accel/tcg/tcg-cpus-rr.c
200
index XXXXXXX..XXXXXXX 100644
201
--- a/accel/tcg/tcg-cpus-rr.c
202
+++ b/accel/tcg/tcg-cpus-rr.c
203
@@ -XXX,XX +XXX,XX @@
204
#include "tcg-cpus-icount.h"
205
206
/* Kick all RR vCPUs */
207
-void qemu_cpu_kick_rr_cpus(CPUState *unused)
208
+void rr_kick_vcpu_thread(CPUState *unused)
209
{
210
CPUState *cpu;
211
212
@@ -XXX,XX +XXX,XX @@ void qemu_cpu_kick_rr_cpus(CPUState *unused)
213
* idleness is complete.
214
*/
215
216
-static QEMUTimer *tcg_kick_vcpu_timer;
217
-static CPUState *tcg_current_rr_cpu;
218
+static QEMUTimer *rr_kick_vcpu_timer;
219
+static CPUState *rr_current_cpu;
220
221
#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
222
223
-static inline int64_t qemu_tcg_next_kick(void)
224
+static inline int64_t rr_next_kick_time(void)
225
{
226
return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
227
}
228
229
/* Kick the currently round-robin scheduled vCPU to next */
230
-static void qemu_cpu_kick_rr_next_cpu(void)
231
+static void rr_kick_next_cpu(void)
232
{
233
CPUState *cpu;
234
do {
235
- cpu = qatomic_mb_read(&tcg_current_rr_cpu);
236
+ cpu = qatomic_mb_read(&rr_current_cpu);
237
if (cpu) {
238
cpu_exit(cpu);
239
}
240
- } while (cpu != qatomic_mb_read(&tcg_current_rr_cpu));
241
+ } while (cpu != qatomic_mb_read(&rr_current_cpu));
242
}
243
244
-static void kick_tcg_thread(void *opaque)
245
+static void rr_kick_thread(void *opaque)
246
{
247
- timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
248
- qemu_cpu_kick_rr_next_cpu();
249
+ timer_mod(rr_kick_vcpu_timer, rr_next_kick_time());
250
+ rr_kick_next_cpu();
251
}
252
253
-static void start_tcg_kick_timer(void)
254
+static void rr_start_kick_timer(void)
255
{
256
- if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
257
- tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
258
- kick_tcg_thread, NULL);
259
+ if (!rr_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
260
+ rr_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
261
+ rr_kick_thread, NULL);
262
}
263
- if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
264
- timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
265
+ if (rr_kick_vcpu_timer && !timer_pending(rr_kick_vcpu_timer)) {
266
+ timer_mod(rr_kick_vcpu_timer, rr_next_kick_time());
267
}
268
}
269
270
-static void stop_tcg_kick_timer(void)
271
+static void rr_stop_kick_timer(void)
272
{
273
- if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
274
- timer_del(tcg_kick_vcpu_timer);
275
+ if (rr_kick_vcpu_timer && timer_pending(rr_kick_vcpu_timer)) {
276
+ timer_del(rr_kick_vcpu_timer);
277
}
278
}
279
280
-static void qemu_tcg_rr_wait_io_event(void)
281
+static void rr_wait_io_event(void)
282
{
283
CPUState *cpu;
284
285
while (all_cpu_threads_idle()) {
286
- stop_tcg_kick_timer();
287
+ rr_stop_kick_timer();
288
qemu_cond_wait_iothread(first_cpu->halt_cond);
289
}
290
291
- start_tcg_kick_timer();
292
+ rr_start_kick_timer();
293
294
CPU_FOREACH(cpu) {
295
qemu_wait_io_event_common(cpu);
296
@@ -XXX,XX +XXX,XX @@ static void qemu_tcg_rr_wait_io_event(void)
297
* Destroy any remaining vCPUs which have been unplugged and have
298
* finished running
299
*/
300
-static void deal_with_unplugged_cpus(void)
301
+static void rr_deal_with_unplugged_cpus(void)
302
{
303
CPUState *cpu;
304
305
CPU_FOREACH(cpu) {
306
if (cpu->unplug && !cpu_can_run(cpu)) {
307
- qemu_tcg_destroy_vcpu(cpu);
308
+ tcg_cpus_destroy(cpu);
309
break;
310
}
311
}
312
@@ -XXX,XX +XXX,XX @@ static void deal_with_unplugged_cpus(void)
313
* elsewhere.
314
*/
315
316
-static void *tcg_rr_cpu_thread_fn(void *arg)
317
+static void *rr_cpu_thread_fn(void *arg)
318
{
319
CPUState *cpu = arg;
320
321
@@ -XXX,XX +XXX,XX @@ static void *tcg_rr_cpu_thread_fn(void *arg)
322
}
323
}
324
325
- start_tcg_kick_timer();
326
+ rr_start_kick_timer();
327
328
cpu = first_cpu;
329
330
@@ -XXX,XX +XXX,XX @@ static void *tcg_rr_cpu_thread_fn(void *arg)
331
* Run the timers here. This is much more efficient than
332
* waking up the I/O thread and waiting for completion.
333
*/
334
- handle_icount_deadline();
335
+ icount_handle_deadline();
336
}
337
338
replay_mutex_unlock();
339
@@ -XXX,XX +XXX,XX @@ static void *tcg_rr_cpu_thread_fn(void *arg)
340
341
while (cpu && cpu_work_list_empty(cpu) && !cpu->exit_request) {
342
343
- qatomic_mb_set(&tcg_current_rr_cpu, cpu);
344
+ qatomic_mb_set(&rr_current_cpu, cpu);
345
current_cpu = cpu;
346
347
qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
348
@@ -XXX,XX +XXX,XX @@ static void *tcg_rr_cpu_thread_fn(void *arg)
349
350
qemu_mutex_unlock_iothread();
351
if (icount_enabled()) {
352
- prepare_icount_for_run(cpu);
353
+ icount_prepare_for_run(cpu);
354
}
355
- r = tcg_cpu_exec(cpu);
356
+ r = tcg_cpus_exec(cpu);
357
if (icount_enabled()) {
358
- process_icount_data(cpu);
359
+ icount_process_data(cpu);
360
}
361
qemu_mutex_lock_iothread();
362
363
@@ -XXX,XX +XXX,XX @@ static void *tcg_rr_cpu_thread_fn(void *arg)
364
} /* while (cpu && !cpu->exit_request).. */
365
366
/* Does not need qatomic_mb_set because a spurious wakeup is okay. */
367
- qatomic_set(&tcg_current_rr_cpu, NULL);
368
+ qatomic_set(&rr_current_cpu, NULL);
369
370
if (cpu && cpu->exit_request) {
371
qatomic_mb_set(&cpu->exit_request, 0);
372
@@ -XXX,XX +XXX,XX @@ static void *tcg_rr_cpu_thread_fn(void *arg)
373
qemu_notify_event();
374
}
375
376
- qemu_tcg_rr_wait_io_event();
377
- deal_with_unplugged_cpus();
378
+ rr_wait_io_event();
379
+ rr_deal_with_unplugged_cpus();
380
}
381
382
rcu_unregister_thread();
383
@@ -XXX,XX +XXX,XX @@ void rr_start_vcpu_thread(CPUState *cpu)
384
/* share a single thread for all cpus with TCG */
385
snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
386
qemu_thread_create(cpu->thread, thread_name,
387
- tcg_rr_cpu_thread_fn,
388
+ rr_cpu_thread_fn,
389
cpu, QEMU_THREAD_JOINABLE);
390
391
single_tcg_halt_cond = cpu->halt_cond;
392
@@ -XXX,XX +XXX,XX @@ void rr_start_vcpu_thread(CPUState *cpu)
393
394
const CpusAccel tcg_cpus_rr = {
395
.create_vcpu_thread = rr_start_vcpu_thread,
396
- .kick_vcpu_thread = qemu_cpu_kick_rr_cpus,
397
+ .kick_vcpu_thread = rr_kick_vcpu_thread,
398
399
- .handle_interrupt = tcg_handle_interrupt,
400
+ .handle_interrupt = tcg_cpus_handle_interrupt,
401
};
402
diff --git a/accel/tcg/tcg-cpus.c b/accel/tcg/tcg-cpus.c
403
index XXXXXXX..XXXXXXX 100644
404
--- a/accel/tcg/tcg-cpus.c
405
+++ b/accel/tcg/tcg-cpus.c
406
@@ -XXX,XX +XXX,XX @@
407
408
/* common functionality among all TCG variants */
409
410
-void qemu_tcg_destroy_vcpu(CPUState *cpu)
411
+void tcg_cpus_destroy(CPUState *cpu)
412
{
413
cpu_thread_signal_destroyed(cpu);
414
}
415
416
-int tcg_cpu_exec(CPUState *cpu)
417
+int tcg_cpus_exec(CPUState *cpu)
418
{
419
int ret;
420
#ifdef CONFIG_PROFILER
421
@@ -XXX,XX +XXX,XX @@ int tcg_cpu_exec(CPUState *cpu)
422
}
423
424
/* mask must never be zero, except for A20 change call */
425
-void tcg_handle_interrupt(CPUState *cpu, int mask)
426
+void tcg_cpus_handle_interrupt(CPUState *cpu, int mask)
427
{
428
g_assert(qemu_mutex_iothread_locked());
429
80
430
--
81
--
431
2.25.1
82
2.34.1
432
83
433
84
New patch
1
Replace 'tc_ptr' and 'addr' with 'tb' and 'n'.
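Put loosely (simplified types, illustrative names only), a backend now recovers what the old parameters carried from the TB pointer and the slot index:

#include <stdint.h>

typedef struct TBArgSketch {
    void *tc_ptr;                  /* start of the translated code */
    uintptr_t jmp_target_addr[2];  /* destination for each jump slot */
} TBArgSketch;

static void set_jmp_target_sketch(const TBArgSketch *tb, int n,
                                  uintptr_t jmp_rx, uintptr_t jmp_rw)
{
    uintptr_t addr = tb->jmp_target_addr[n];           /* old 'addr' arg */
    intptr_t tb_diff = addr - (uintptr_t)tb->tc_ptr;   /* old 'tc_ptr' use */
    intptr_t br_diff = addr - (jmp_rx + 4);            /* branch-relative */

    (void)tb_diff;
    (void)br_diff;
    (void)jmp_rw;
    /* Target-specific patching through jmp_rw is elided here. */
}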
1
2
3
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
4
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
---
6
tcg/aarch64/tcg-target.h | 3 ++-
7
tcg/arm/tcg-target.h | 3 ++-
8
tcg/i386/tcg-target.h | 9 ++-------
9
tcg/loongarch64/tcg-target.h | 3 ++-
10
tcg/mips/tcg-target.h | 3 ++-
11
tcg/ppc/tcg-target.h | 3 ++-
12
tcg/riscv/tcg-target.h | 3 ++-
13
tcg/s390x/tcg-target.h | 10 ++--------
14
tcg/sparc64/tcg-target.h | 3 ++-
15
tcg/tci/tcg-target.h | 3 ++-
16
accel/tcg/cpu-exec.c | 11 ++++++++---
17
tcg/aarch64/tcg-target.c.inc | 5 +++--
18
tcg/i386/tcg-target.c.inc | 9 +++++++++
19
tcg/loongarch64/tcg-target.c.inc | 5 +++--
20
tcg/ppc/tcg-target.c.inc | 7 ++++---
21
tcg/s390x/tcg-target.c.inc | 10 ++++++++++
22
tcg/sparc64/tcg-target.c.inc | 7 ++++---
23
17 files changed, 61 insertions(+), 36 deletions(-)
24
25
diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
26
index XXXXXXX..XXXXXXX 100644
27
--- a/tcg/aarch64/tcg-target.h
28
+++ b/tcg/aarch64/tcg-target.h
29
@@ -XXX,XX +XXX,XX @@ typedef enum {
30
#define TCG_TARGET_DEFAULT_MO (0)
31
#define TCG_TARGET_HAS_MEMORY_BSWAP 0
32
33
-void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t);
34
+void tb_target_set_jmp_target(const TranslationBlock *, int,
35
+ uintptr_t, uintptr_t);
36
37
#define TCG_TARGET_NEED_LDST_LABELS
38
#define TCG_TARGET_NEED_POOL_LABELS
39
diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
40
index XXXXXXX..XXXXXXX 100644
41
--- a/tcg/arm/tcg-target.h
42
+++ b/tcg/arm/tcg-target.h
43
@@ -XXX,XX +XXX,XX @@ extern bool use_neon_instructions;
44
#define TCG_TARGET_HAS_MEMORY_BSWAP 0
45
46
/* not defined -- call should be eliminated at compile time */
47
-void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t);
48
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
49
+ uintptr_t, uintptr_t);
50
51
#define TCG_TARGET_NEED_LDST_LABELS
52
#define TCG_TARGET_NEED_POOL_LABELS
53
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
54
index XXXXXXX..XXXXXXX 100644
55
--- a/tcg/i386/tcg-target.h
56
+++ b/tcg/i386/tcg-target.h
57
@@ -XXX,XX +XXX,XX @@ extern bool have_movbe;
58
#define TCG_TARGET_extract_i64_valid(ofs, len) \
59
(((ofs) == 8 && (len) == 8) || ((ofs) + (len)) == 32)
60
61
-static inline void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
62
- uintptr_t jmp_rw, uintptr_t addr)
63
-{
64
- /* patch the branch destination */
65
- qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4));
66
- /* no need to flush icache explicitly */
67
-}
68
+void tb_target_set_jmp_target(const TranslationBlock *, int,
69
+ uintptr_t, uintptr_t);
70
71
/* This defines the natural memory order supported by this
72
* architecture before guarantees made by various barrier
73
diff --git a/tcg/loongarch64/tcg-target.h b/tcg/loongarch64/tcg-target.h
74
index XXXXXXX..XXXXXXX 100644
75
--- a/tcg/loongarch64/tcg-target.h
76
+++ b/tcg/loongarch64/tcg-target.h
77
@@ -XXX,XX +XXX,XX @@ typedef enum {
78
#define TCG_TARGET_HAS_muluh_i64 1
79
#define TCG_TARGET_HAS_mulsh_i64 1
80
81
-void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t);
82
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
83
+ uintptr_t, uintptr_t);
84
85
#define TCG_TARGET_DEFAULT_MO (0)
86
87
diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h
88
index XXXXXXX..XXXXXXX 100644
89
--- a/tcg/mips/tcg-target.h
90
+++ b/tcg/mips/tcg-target.h
91
@@ -XXX,XX +XXX,XX @@ extern bool use_mips32r2_instructions;
92
#define TCG_TARGET_HAS_MEMORY_BSWAP 1
93
94
/* not defined -- call should be eliminated at compile time */
95
-void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t)
96
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
97
+ uintptr_t, uintptr_t)
98
QEMU_ERROR("code path is reachable");
99
100
#define TCG_TARGET_NEED_LDST_LABELS
101
diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
102
index XXXXXXX..XXXXXXX 100644
103
--- a/tcg/ppc/tcg-target.h
104
+++ b/tcg/ppc/tcg-target.h
105
@@ -XXX,XX +XXX,XX @@ extern bool have_vsx;
106
#define TCG_TARGET_HAS_bitsel_vec have_vsx
107
#define TCG_TARGET_HAS_cmpsel_vec 0
108
109
-void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t);
110
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
111
+ uintptr_t, uintptr_t);
112
113
#define TCG_TARGET_DEFAULT_MO (0)
114
#define TCG_TARGET_HAS_MEMORY_BSWAP 1
115
diff --git a/tcg/riscv/tcg-target.h b/tcg/riscv/tcg-target.h
116
index XXXXXXX..XXXXXXX 100644
117
--- a/tcg/riscv/tcg-target.h
118
+++ b/tcg/riscv/tcg-target.h
119
@@ -XXX,XX +XXX,XX @@ typedef enum {
120
#endif
121
122
/* not defined -- call should be eliminated at compile time */
123
-void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t);
124
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
125
+ uintptr_t, uintptr_t);
126
127
#define TCG_TARGET_DEFAULT_MO (0)
128
129
diff --git a/tcg/s390x/tcg-target.h b/tcg/s390x/tcg-target.h
130
index XXXXXXX..XXXXXXX 100644
131
--- a/tcg/s390x/tcg-target.h
132
+++ b/tcg/s390x/tcg-target.h
133
@@ -XXX,XX +XXX,XX @@ extern uint64_t s390_facilities[3];
134
135
#define TCG_TARGET_DEFAULT_MO (TCG_MO_ALL & ~TCG_MO_ST_LD)
136
137
-static inline void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
138
- uintptr_t jmp_rw, uintptr_t addr)
139
-{
140
- /* patch the branch destination */
141
- intptr_t disp = addr - (jmp_rx - 2);
142
- qatomic_set((int32_t *)jmp_rw, disp / 2);
143
- /* no need to flush icache explicitly */
144
-}
145
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
146
+ uintptr_t jmp_rx, uintptr_t jmp_rw);
147
148
#define TCG_TARGET_NEED_LDST_LABELS
149
#define TCG_TARGET_NEED_POOL_LABELS
150
diff --git a/tcg/sparc64/tcg-target.h b/tcg/sparc64/tcg-target.h
151
index XXXXXXX..XXXXXXX 100644
152
--- a/tcg/sparc64/tcg-target.h
153
+++ b/tcg/sparc64/tcg-target.h
154
@@ -XXX,XX +XXX,XX @@ extern bool use_vis3_instructions;
155
#define TCG_TARGET_DEFAULT_MO (0)
156
#define TCG_TARGET_HAS_MEMORY_BSWAP 1
157
158
-void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t);
159
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
160
+ uintptr_t, uintptr_t);
161
162
#define TCG_TARGET_NEED_POOL_LABELS
163
164
diff --git a/tcg/tci/tcg-target.h b/tcg/tci/tcg-target.h
165
index XXXXXXX..XXXXXXX 100644
166
--- a/tcg/tci/tcg-target.h
167
+++ b/tcg/tci/tcg-target.h
168
@@ -XXX,XX +XXX,XX @@ typedef enum {
169
#define TCG_TARGET_HAS_MEMORY_BSWAP 1
170
171
/* not defined -- call should be eliminated at compile time */
172
-void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t);
173
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
174
+ uintptr_t, uintptr_t);
175
176
#endif /* TCG_TARGET_H */
177
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
178
index XXXXXXX..XXXXXXX 100644
179
--- a/accel/tcg/cpu-exec.c
180
+++ b/accel/tcg/cpu-exec.c
181
@@ -XXX,XX +XXX,XX @@ void tb_set_jmp_target(TranslationBlock *tb, int n, uintptr_t addr)
182
{
183
tb->jmp_target_addr[n] = addr;
184
if (TCG_TARGET_HAS_direct_jump) {
185
+ /*
186
+ * Get the rx view of the structure, from which we find the
187
+ * executable code address, and tb_target_set_jmp_target can
188
+ * produce a pc-relative displacement to jmp_target_addr[n].
189
+ */
190
+ const TranslationBlock *c_tb = tcg_splitwx_to_rx(tb);
191
uintptr_t offset = tb->jmp_insn_offset[n];
192
- uintptr_t tc_ptr = (uintptr_t)tb->tc.ptr;
193
- uintptr_t jmp_rx = tc_ptr + offset;
194
+ uintptr_t jmp_rx = (uintptr_t)tb->tc.ptr + offset;
195
uintptr_t jmp_rw = jmp_rx - tcg_splitwx_diff;
196
- tb_target_set_jmp_target(tc_ptr, jmp_rx, jmp_rw, addr);
197
+ tb_target_set_jmp_target(c_tb, n, jmp_rx, jmp_rw);
198
}
199
}
200
201
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
202
index XXXXXXX..XXXXXXX 100644
203
--- a/tcg/aarch64/tcg-target.c.inc
204
+++ b/tcg/aarch64/tcg-target.c.inc
205
@@ -XXX,XX +XXX,XX @@ static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target,
206
tcg_out_call_int(s, target);
207
}
208
209
-void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
210
- uintptr_t jmp_rw, uintptr_t addr)
211
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
212
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
213
{
214
+ uintptr_t addr = tb->jmp_target_addr[n];
215
tcg_insn_unit i1, i2;
216
TCGType rt = TCG_TYPE_I64;
217
TCGReg rd = TCG_REG_TMP;
218
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
219
index XXXXXXX..XXXXXXX 100644
220
--- a/tcg/i386/tcg-target.c.inc
221
+++ b/tcg/i386/tcg-target.c.inc
222
@@ -XXX,XX +XXX,XX @@ static void tcg_out_goto_tb(TCGContext *s, int which)
223
set_jmp_reset_offset(s, which);
224
}
225
226
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
227
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
228
+{
229
+ /* patch the branch destination */
230
+ uintptr_t addr = tb->jmp_target_addr[n];
231
+ qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4));
232
+ /* no need to flush icache explicitly */
233
+}
234
+
235
static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
236
const TCGArg args[TCG_MAX_OP_ARGS],
237
const int const_args[TCG_MAX_OP_ARGS])
238
diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
239
index XXXXXXX..XXXXXXX 100644
240
--- a/tcg/loongarch64/tcg-target.c.inc
241
+++ b/tcg/loongarch64/tcg-target.c.inc
242
@@ -XXX,XX +XXX,XX @@ static void tcg_out_nop(TCGContext *s)
243
tcg_out32(s, NOP);
244
}
245
246
-void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
247
- uintptr_t jmp_rw, uintptr_t addr)
248
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
249
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
250
{
251
tcg_insn_unit i1, i2;
252
ptrdiff_t upper, lower;
253
+ uintptr_t addr = tb->jmp_target_addr[n];
254
ptrdiff_t offset = (ptrdiff_t)(addr - jmp_rx) >> 2;
255
256
if (offset == sextreg(offset, 0, 26)) {
257
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
258
index XXXXXXX..XXXXXXX 100644
259
--- a/tcg/ppc/tcg-target.c.inc
260
+++ b/tcg/ppc/tcg-target.c.inc
261
@@ -XXX,XX +XXX,XX @@ static inline void ppc64_replace4(uintptr_t rx, uintptr_t rw,
262
flush_idcache_range(rx, rw, 16);
263
}
264
265
-void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
266
- uintptr_t jmp_rw, uintptr_t addr)
267
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
268
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
269
{
270
tcg_insn_unit i0, i1, i2, i3;
271
- intptr_t tb_diff = addr - tc_ptr;
272
+ uintptr_t addr = tb->jmp_target_addr[n];
273
+ intptr_t tb_diff = addr - (uintptr_t)tb->tc.ptr;
274
intptr_t br_diff = addr - (jmp_rx + 4);
275
intptr_t lo, hi;
276
277
diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
278
index XXXXXXX..XXXXXXX 100644
279
--- a/tcg/s390x/tcg-target.c.inc
280
+++ b/tcg/s390x/tcg-target.c.inc
281
@@ -XXX,XX +XXX,XX @@ static void tcg_out_goto_tb(TCGContext *s, int which)
282
set_jmp_reset_offset(s, which);
283
}
284
285
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
286
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
287
+{
288
+ /* patch the branch destination */
289
+ uintptr_t addr = tb->jmp_target_addr[n];
290
+ intptr_t disp = addr - (jmp_rx - 2);
291
+ qatomic_set((int32_t *)jmp_rw, disp / 2);
292
+ /* no need to flush icache explicitly */
293
+}
294
+
295
# define OP_32_64(x) \
296
case glue(glue(INDEX_op_,x),_i32): \
297
case glue(glue(INDEX_op_,x),_i64)
298
diff --git a/tcg/sparc64/tcg-target.c.inc b/tcg/sparc64/tcg-target.c.inc
299
index XXXXXXX..XXXXXXX 100644
300
--- a/tcg/sparc64/tcg-target.c.inc
301
+++ b/tcg/sparc64/tcg-target.c.inc
302
@@ -XXX,XX +XXX,XX @@ void tcg_register_jit(const void *buf, size_t buf_size)
303
tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
304
}
305
306
-void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
307
- uintptr_t jmp_rw, uintptr_t addr)
308
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
309
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
310
{
311
- intptr_t tb_disp = addr - tc_ptr;
312
+ uintptr_t addr = tb->jmp_target_addr[n];
313
+ intptr_t tb_disp = addr - (uintptr_t)tb->tc.ptr;
314
intptr_t br_disp = addr - jmp_rx;
315
tcg_insn_unit i1, i2;
316
317
--
318
2.34.1
319
320
New patch
1
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
2
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
3
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
4
---
5
include/tcg/tcg.h | 3 +++
6
tcg/aarch64/tcg-target.h | 4 ----
7
tcg/arm/tcg-target.h | 5 -----
8
tcg/i386/tcg-target.h | 3 ---
9
tcg/loongarch64/tcg-target.h | 3 ---
10
tcg/mips/tcg-target.h | 5 -----
11
tcg/ppc/tcg-target.h | 4 ----
12
tcg/riscv/tcg-target.h | 4 ----
13
tcg/s390x/tcg-target.h | 4 ----
14
tcg/sparc64/tcg-target.h | 4 ----
15
tcg/tci/tcg-target.h | 4 ----
16
11 files changed, 3 insertions(+), 40 deletions(-)
1
17
18
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
19
index XXXXXXX..XXXXXXX 100644
20
--- a/include/tcg/tcg.h
21
+++ b/include/tcg/tcg.h
22
@@ -XXX,XX +XXX,XX @@ void tcg_func_start(TCGContext *s);
23
24
int tcg_gen_code(TCGContext *s, TranslationBlock *tb, target_ulong pc_start);
25
26
+void tb_target_set_jmp_target(const TranslationBlock *, int,
27
+ uintptr_t, uintptr_t);
28
+
29
void tcg_set_frame(TCGContext *s, TCGReg reg, intptr_t start, intptr_t size);
30
31
TCGTemp *tcg_global_mem_new_internal(TCGType, TCGv_ptr,
32
diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
33
index XXXXXXX..XXXXXXX 100644
34
--- a/tcg/aarch64/tcg-target.h
35
+++ b/tcg/aarch64/tcg-target.h
36
@@ -XXX,XX +XXX,XX @@ typedef enum {
37
38
#define TCG_TARGET_DEFAULT_MO (0)
39
#define TCG_TARGET_HAS_MEMORY_BSWAP 0
40
-
41
-void tb_target_set_jmp_target(const TranslationBlock *, int,
42
- uintptr_t, uintptr_t);
43
-
44
#define TCG_TARGET_NEED_LDST_LABELS
45
#define TCG_TARGET_NEED_POOL_LABELS
46
47
diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
48
index XXXXXXX..XXXXXXX 100644
49
--- a/tcg/arm/tcg-target.h
50
+++ b/tcg/arm/tcg-target.h
51
@@ -XXX,XX +XXX,XX @@ extern bool use_neon_instructions;
52
53
#define TCG_TARGET_DEFAULT_MO (0)
54
#define TCG_TARGET_HAS_MEMORY_BSWAP 0
55
-
56
-/* not defined -- call should be eliminated at compile time */
57
-void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
58
- uintptr_t, uintptr_t);
59
-
60
#define TCG_TARGET_NEED_LDST_LABELS
61
#define TCG_TARGET_NEED_POOL_LABELS
62
63
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
64
index XXXXXXX..XXXXXXX 100644
65
--- a/tcg/i386/tcg-target.h
66
+++ b/tcg/i386/tcg-target.h
67
@@ -XXX,XX +XXX,XX @@ extern bool have_movbe;
68
#define TCG_TARGET_extract_i64_valid(ofs, len) \
69
(((ofs) == 8 && (len) == 8) || ((ofs) + (len)) == 32)
70
71
-void tb_target_set_jmp_target(const TranslationBlock *, int,
72
- uintptr_t, uintptr_t);
73
-
74
/* This defines the natural memory order supported by this
75
* architecture before guarantees made by various barrier
76
* instructions.
77
diff --git a/tcg/loongarch64/tcg-target.h b/tcg/loongarch64/tcg-target.h
78
index XXXXXXX..XXXXXXX 100644
79
--- a/tcg/loongarch64/tcg-target.h
80
+++ b/tcg/loongarch64/tcg-target.h
81
@@ -XXX,XX +XXX,XX @@ typedef enum {
82
#define TCG_TARGET_HAS_muluh_i64 1
83
#define TCG_TARGET_HAS_mulsh_i64 1
84
85
-void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
86
- uintptr_t, uintptr_t);
87
-
88
#define TCG_TARGET_DEFAULT_MO (0)
89
90
#define TCG_TARGET_NEED_LDST_LABELS
91
diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h
92
index XXXXXXX..XXXXXXX 100644
93
--- a/tcg/mips/tcg-target.h
94
+++ b/tcg/mips/tcg-target.h
95
@@ -XXX,XX +XXX,XX @@ extern bool use_mips32r2_instructions;
96
#define TCG_TARGET_DEFAULT_MO (0)
97
#define TCG_TARGET_HAS_MEMORY_BSWAP 1
98
99
-/* not defined -- call should be eliminated at compile time */
100
-void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
101
- uintptr_t, uintptr_t)
102
- QEMU_ERROR("code path is reachable");
103
-
104
#define TCG_TARGET_NEED_LDST_LABELS
105
106
#endif
107
diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
108
index XXXXXXX..XXXXXXX 100644
109
--- a/tcg/ppc/tcg-target.h
110
+++ b/tcg/ppc/tcg-target.h
111
@@ -XXX,XX +XXX,XX @@ extern bool have_vsx;
112
#define TCG_TARGET_HAS_bitsel_vec have_vsx
113
#define TCG_TARGET_HAS_cmpsel_vec 0
114
115
-void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
116
- uintptr_t, uintptr_t);
117
-
118
#define TCG_TARGET_DEFAULT_MO (0)
119
#define TCG_TARGET_HAS_MEMORY_BSWAP 1
120
-
121
#define TCG_TARGET_NEED_LDST_LABELS
122
#define TCG_TARGET_NEED_POOL_LABELS
123
124
diff --git a/tcg/riscv/tcg-target.h b/tcg/riscv/tcg-target.h
125
index XXXXXXX..XXXXXXX 100644
126
--- a/tcg/riscv/tcg-target.h
127
+++ b/tcg/riscv/tcg-target.h
128
@@ -XXX,XX +XXX,XX @@ typedef enum {
129
#define TCG_TARGET_HAS_mulsh_i64 1
130
#endif
131
132
-/* not defined -- call should be eliminated at compile time */
133
-void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
134
- uintptr_t, uintptr_t);
135
-
136
#define TCG_TARGET_DEFAULT_MO (0)
137
138
#define TCG_TARGET_NEED_LDST_LABELS
139
diff --git a/tcg/s390x/tcg-target.h b/tcg/s390x/tcg-target.h
140
index XXXXXXX..XXXXXXX 100644
141
--- a/tcg/s390x/tcg-target.h
142
+++ b/tcg/s390x/tcg-target.h
143
@@ -XXX,XX +XXX,XX @@ extern uint64_t s390_facilities[3];
144
#define TCG_TARGET_HAS_MEMORY_BSWAP 1
145
146
#define TCG_TARGET_DEFAULT_MO (TCG_MO_ALL & ~TCG_MO_ST_LD)
147
-
148
-void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
149
- uintptr_t jmp_rx, uintptr_t jmp_rw);
150
-
151
#define TCG_TARGET_NEED_LDST_LABELS
152
#define TCG_TARGET_NEED_POOL_LABELS
153
154
diff --git a/tcg/sparc64/tcg-target.h b/tcg/sparc64/tcg-target.h
155
index XXXXXXX..XXXXXXX 100644
156
--- a/tcg/sparc64/tcg-target.h
157
+++ b/tcg/sparc64/tcg-target.h
158
@@ -XXX,XX +XXX,XX @@ extern bool use_vis3_instructions;
159
160
#define TCG_TARGET_DEFAULT_MO (0)
161
#define TCG_TARGET_HAS_MEMORY_BSWAP 1
162
-
163
-void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
164
- uintptr_t, uintptr_t);
165
-
166
#define TCG_TARGET_NEED_POOL_LABELS
167
168
#endif
169
diff --git a/tcg/tci/tcg-target.h b/tcg/tci/tcg-target.h
170
index XXXXXXX..XXXXXXX 100644
171
--- a/tcg/tci/tcg-target.h
172
+++ b/tcg/tci/tcg-target.h
173
@@ -XXX,XX +XXX,XX @@ typedef enum {
174
175
#define TCG_TARGET_HAS_MEMORY_BSWAP 1
176
177
-/* not defined -- call should be eliminated at compile time */
178
-void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
179
- uintptr_t, uintptr_t);
180
-
181
#endif /* TCG_TARGET_H */
182
--
183
2.34.1
184
185
1
From: Claudio Fontana <cfontana@suse.de>
1
Install empty versions for !TCG_TARGET_HAS_direct_jump hosts.
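On a host that only uses the indirect jump method, the newly required definition amounts to a stub along these lines (a sketch with a stand-in TB type, not a quote of any one backend):

#include <stdint.h>

struct TranslationBlockSketch;   /* opaque stand-in for the real TB */

void tb_target_set_jmp_target_sketch(const struct TranslationBlockSketch *tb,
                                     int n, uintptr_t jmp_rx, uintptr_t jmp_rw)
{
    /* Always indirect: the goto_tb sequence reloads its destination from
     * jmp_target_addr[], so there is no generated code to patch here. */
    (void)tb; (void)n; (void)jmp_rx; (void)jmp_rw;
}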
2
2
3
split up the CpusAccel tcg_cpus into three TCG variants:
3
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
4
4
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
5
tcg_cpus_rr (single threaded, round robin cpus)
6
tcg_cpus_icount (same as rr, but with instruction counting enabled)
7
tcg_cpus_mttcg (multi-threaded cpus)
8
9
Suggested-by: Richard Henderson <richard.henderson@linaro.org>
10
Signed-off-by: Claudio Fontana <cfontana@suse.de>
11
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
12
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
13
Message-Id: <20201015143217.29337-2-cfontana@suse.de>
14
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
15
---
6
---
16
accel/tcg/tcg-cpus-icount.h | 17 ++
7
tcg/arm/tcg-target.c.inc | 6 ++++++
17
accel/tcg/tcg-cpus-mttcg.h | 21 ++
8
tcg/mips/tcg-target.c.inc | 6 ++++++
18
accel/tcg/tcg-cpus-rr.h | 20 ++
9
tcg/riscv/tcg-target.c.inc | 6 ++++++
19
accel/tcg/tcg-cpus.h | 13 +-
10
tcg/tci/tcg-target.c.inc | 6 ++++++
20
accel/tcg/tcg-all.c | 8 +-
11
4 files changed, 24 insertions(+)
21
accel/tcg/tcg-cpus-icount.c | 147 +++++++++++
22
accel/tcg/tcg-cpus-mttcg.c | 117 +++++++++
23
accel/tcg/tcg-cpus-rr.c | 270 ++++++++++++++++++++
24
accel/tcg/tcg-cpus.c | 484 ++----------------------------------
25
softmmu/icount.c | 2 +-
26
accel/tcg/meson.build | 9 +-
27
11 files changed, 646 insertions(+), 462 deletions(-)
28
create mode 100644 accel/tcg/tcg-cpus-icount.h
29
create mode 100644 accel/tcg/tcg-cpus-mttcg.h
30
create mode 100644 accel/tcg/tcg-cpus-rr.h
31
create mode 100644 accel/tcg/tcg-cpus-icount.c
32
create mode 100644 accel/tcg/tcg-cpus-mttcg.c
33
create mode 100644 accel/tcg/tcg-cpus-rr.c
34
12
35
diff --git a/accel/tcg/tcg-cpus-icount.h b/accel/tcg/tcg-cpus-icount.h
13
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
36
new file mode 100644
37
index XXXXXXX..XXXXXXX
38
--- /dev/null
39
+++ b/accel/tcg/tcg-cpus-icount.h
40
@@ -XXX,XX +XXX,XX @@
41
+/*
42
+ * QEMU TCG Single Threaded vCPUs implementation using instruction counting
43
+ *
44
+ * Copyright 2020 SUSE LLC
45
+ *
46
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
47
+ * See the COPYING file in the top-level directory.
48
+ */
49
+
50
+#ifndef TCG_CPUS_ICOUNT_H
51
+#define TCG_CPUS_ICOUNT_H
52
+
53
+void handle_icount_deadline(void);
54
+void prepare_icount_for_run(CPUState *cpu);
55
+void process_icount_data(CPUState *cpu);
56
+
57
+#endif /* TCG_CPUS_ICOUNT_H */
58
diff --git a/accel/tcg/tcg-cpus-mttcg.h b/accel/tcg/tcg-cpus-mttcg.h
59
new file mode 100644
60
index XXXXXXX..XXXXXXX
61
--- /dev/null
62
+++ b/accel/tcg/tcg-cpus-mttcg.h
63
@@ -XXX,XX +XXX,XX @@
64
+/*
65
+ * QEMU TCG Multi Threaded vCPUs implementation
66
+ *
67
+ * Copyright 2020 SUSE LLC
68
+ *
69
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
70
+ * See the COPYING file in the top-level directory.
71
+ */
72
+
73
+#ifndef TCG_CPUS_MTTCG_H
74
+#define TCG_CPUS_MTTCG_H
75
+
76
+/*
77
+ * In the multi-threaded case each vCPU has its own thread. The TLS
78
+ * variable current_cpu can be used deep in the code to find the
79
+ * current CPUState for a given thread.
80
+ */
81
+
82
+void *tcg_cpu_thread_fn(void *arg);
83
+
84
+#endif /* TCG_CPUS_MTTCG_H */
85
diff --git a/accel/tcg/tcg-cpus-rr.h b/accel/tcg/tcg-cpus-rr.h
86
new file mode 100644
87
index XXXXXXX..XXXXXXX
88
--- /dev/null
89
+++ b/accel/tcg/tcg-cpus-rr.h
90
@@ -XXX,XX +XXX,XX @@
91
+/*
92
+ * QEMU TCG Single Threaded vCPUs implementation
93
+ *
94
+ * Copyright 2020 SUSE LLC
95
+ *
96
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
97
+ * See the COPYING file in the top-level directory.
98
+ */
99
+
100
+#ifndef TCG_CPUS_RR_H
101
+#define TCG_CPUS_RR_H
102
+
103
+#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
104
+
105
+/* Kick all RR vCPUs. */
106
+void qemu_cpu_kick_rr_cpus(CPUState *unused);
107
+
108
+void *tcg_rr_cpu_thread_fn(void *arg);
109
+
110
+#endif /* TCG_CPUS_RR_H */
111
diff --git a/accel/tcg/tcg-cpus.h b/accel/tcg/tcg-cpus.h
112
index XXXXXXX..XXXXXXX 100644
14
index XXXXXXX..XXXXXXX 100644
113
--- a/accel/tcg/tcg-cpus.h
15
--- a/tcg/arm/tcg-target.c.inc
114
+++ b/accel/tcg/tcg-cpus.h
16
+++ b/tcg/arm/tcg-target.c.inc
115
@@ -XXX,XX +XXX,XX @@
17
@@ -XXX,XX +XXX,XX @@ static void tcg_out_goto_tb(TCGContext *s, int which)
116
/*
18
set_jmp_reset_offset(s, which);
117
- * Accelerator CPUS Interface
118
+ * QEMU TCG vCPU common functionality
119
+ *
120
+ * Functionality common to all TCG vcpu variants: mttcg, rr and icount.
121
*
122
* Copyright 2020 SUSE LLC
123
*
124
@@ -XXX,XX +XXX,XX @@
125
126
#include "sysemu/cpus.h"
127
128
-extern const CpusAccel tcg_cpus;
129
+extern const CpusAccel tcg_cpus_mttcg;
130
+extern const CpusAccel tcg_cpus_icount;
131
+extern const CpusAccel tcg_cpus_rr;
132
+
133
+void tcg_start_vcpu_thread(CPUState *cpu);
134
+void qemu_tcg_destroy_vcpu(CPUState *cpu);
135
+int tcg_cpu_exec(CPUState *cpu);
136
+void tcg_handle_interrupt(CPUState *cpu, int mask);
137
138
#endif /* TCG_CPUS_H */
139
diff --git a/accel/tcg/tcg-all.c b/accel/tcg/tcg-all.c
140
index XXXXXXX..XXXXXXX 100644
141
--- a/accel/tcg/tcg-all.c
142
+++ b/accel/tcg/tcg-all.c
143
@@ -XXX,XX +XXX,XX @@ static int tcg_init(MachineState *ms)
144
145
tcg_exec_init(s->tb_size * 1024 * 1024);
146
mttcg_enabled = s->mttcg_enabled;
147
- cpus_register_accel(&tcg_cpus);
148
149
+ if (mttcg_enabled) {
150
+ cpus_register_accel(&tcg_cpus_mttcg);
151
+ } else if (icount_enabled()) {
152
+ cpus_register_accel(&tcg_cpus_icount);
153
+ } else {
154
+ cpus_register_accel(&tcg_cpus_rr);
155
+ }
156
return 0;
157
}
19
}
158
20
159
diff --git a/accel/tcg/tcg-cpus-icount.c b/accel/tcg/tcg-cpus-icount.c
21
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
160
new file mode 100644
22
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
161
index XXXXXXX..XXXXXXX
162
--- /dev/null
163
+++ b/accel/tcg/tcg-cpus-icount.c
164
@@ -XXX,XX +XXX,XX @@
165
+/*
166
+ * QEMU TCG Single Threaded vCPUs implementation using instruction counting
167
+ *
168
+ * Copyright (c) 2003-2008 Fabrice Bellard
169
+ * Copyright (c) 2014 Red Hat Inc.
170
+ *
171
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
172
+ * of this software and associated documentation files (the "Software"), to deal
173
+ * in the Software without restriction, including without limitation the rights
174
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
175
+ * copies of the Software, and to permit persons to whom the Software is
176
+ * furnished to do so, subject to the following conditions:
177
+ *
178
+ * The above copyright notice and this permission notice shall be included in
179
+ * all copies or substantial portions of the Software.
180
+ *
181
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
182
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
183
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
184
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
185
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
186
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
187
+ * THE SOFTWARE.
188
+ */
189
+
190
+#include "qemu/osdep.h"
191
+#include "qemu-common.h"
192
+#include "sysemu/tcg.h"
193
+#include "sysemu/replay.h"
194
+#include "qemu/main-loop.h"
195
+#include "qemu/guest-random.h"
196
+#include "exec/exec-all.h"
197
+#include "hw/boards.h"
198
+
199
+#include "tcg-cpus.h"
200
+#include "tcg-cpus-icount.h"
201
+#include "tcg-cpus-rr.h"
202
+
203
+static int64_t tcg_get_icount_limit(void)
204
+{
23
+{
205
+ int64_t deadline;
24
+ /* Always indirect, nothing to do */
206
+
207
+ if (replay_mode != REPLAY_MODE_PLAY) {
208
+ /*
209
+ * Include all the timers, because they may need an attention.
210
+ * Too long CPU execution may create unnecessary delay in UI.
211
+ */
212
+ deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
213
+ QEMU_TIMER_ATTR_ALL);
214
+ /* Check realtime timers, because they help with input processing */
215
+ deadline = qemu_soonest_timeout(deadline,
216
+ qemu_clock_deadline_ns_all(QEMU_CLOCK_REALTIME,
217
+ QEMU_TIMER_ATTR_ALL));
218
+
219
+ /*
220
+ * Maintain prior (possibly buggy) behaviour where if no deadline
221
+ * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
222
+ * INT32_MAX nanoseconds ahead, we still use INT32_MAX
223
+ * nanoseconds.
224
+ */
225
+ if ((deadline < 0) || (deadline > INT32_MAX)) {
226
+ deadline = INT32_MAX;
227
+ }
228
+
229
+ return icount_round(deadline);
230
+ } else {
231
+ return replay_get_instructions();
232
+ }
233
+}
25
+}
234
+
26
+
235
+static void notify_aio_contexts(void)
27
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
28
const TCGArg args[TCG_MAX_OP_ARGS],
29
const int const_args[TCG_MAX_OP_ARGS])
30
diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
31
index XXXXXXX..XXXXXXX 100644
32
--- a/tcg/mips/tcg-target.c.inc
33
+++ b/tcg/mips/tcg-target.c.inc
34
@@ -XXX,XX +XXX,XX @@ static void tcg_out_goto_tb(TCGContext *s, int which)
35
set_jmp_reset_offset(s, which);
36
}
37
38
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
39
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
236
+{
40
+{
237
+ /* Wake up other AioContexts. */
41
+ /* Always indirect, nothing to do */
238
+ qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
239
+ qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
240
+}
42
+}
241
+
43
+
242
+void handle_icount_deadline(void)
44
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
45
const TCGArg args[TCG_MAX_OP_ARGS],
46
const int const_args[TCG_MAX_OP_ARGS])
47
diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
48
index XXXXXXX..XXXXXXX 100644
49
--- a/tcg/riscv/tcg-target.c.inc
50
+++ b/tcg/riscv/tcg-target.c.inc
51
@@ -XXX,XX +XXX,XX @@ static void tcg_out_goto_tb(TCGContext *s, int which)
52
set_jmp_reset_offset(s, which);
53
}
54
55
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
56
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
243
+{
57
+{
244
+ assert(qemu_in_vcpu_thread());
58
+ /* Always indirect, nothing to do */
245
+ int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
246
+ QEMU_TIMER_ATTR_ALL);
247
+
248
+ if (deadline == 0) {
249
+ notify_aio_contexts();
250
+ }
251
+}
59
+}
252
+
60
+
253
+void prepare_icount_for_run(CPUState *cpu)
61
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
62
const TCGArg args[TCG_MAX_OP_ARGS],
63
const int const_args[TCG_MAX_OP_ARGS])
64
diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
65
index XXXXXXX..XXXXXXX 100644
66
--- a/tcg/tci/tcg-target.c.inc
67
+++ b/tcg/tci/tcg-target.c.inc
68
@@ -XXX,XX +XXX,XX @@ static void tcg_out_goto_tb(TCGContext *s, int which)
69
set_jmp_reset_offset(s, which);
70
}
71
72
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
73
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
254
+{
74
+{
255
+ int insns_left;
75
+ /* Always indirect, nothing to do */
256
+
257
+ /*
258
+ * These should always be cleared by process_icount_data after
259
+ * each vCPU execution. However u16.high can be raised
260
+ * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
261
+ */
262
+ g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
263
+ g_assert(cpu->icount_extra == 0);
264
+
265
+ cpu->icount_budget = tcg_get_icount_limit();
266
+ insns_left = MIN(0xffff, cpu->icount_budget);
267
+ cpu_neg(cpu)->icount_decr.u16.low = insns_left;
268
+ cpu->icount_extra = cpu->icount_budget - insns_left;
269
+
270
+ replay_mutex_lock();
271
+
272
+ if (cpu->icount_budget == 0 && replay_has_checkpoint()) {
273
+ notify_aio_contexts();
274
+ }
275
+}
76
+}
276
+
77
+
277
+void process_icount_data(CPUState *cpu)
78
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
278
+{
79
const TCGArg args[TCG_MAX_OP_ARGS],
279
+ /* Account for executed instructions */
80
const int const_args[TCG_MAX_OP_ARGS])
280
+ icount_update(cpu);
281
+
282
+ /* Reset the counters */
283
+ cpu_neg(cpu)->icount_decr.u16.low = 0;
284
+ cpu->icount_extra = 0;
285
+ cpu->icount_budget = 0;
286
+
287
+ replay_account_executed_instructions();
288
+
289
+ replay_mutex_unlock();
290
+}
291
+
292
+static void icount_handle_interrupt(CPUState *cpu, int mask)
293
+{
294
+ int old_mask = cpu->interrupt_request;
295
+
296
+ tcg_handle_interrupt(cpu, mask);
297
+ if (qemu_cpu_is_self(cpu) &&
298
+ !cpu->can_do_io
299
+ && (mask & ~old_mask) != 0) {
300
+ cpu_abort(cpu, "Raised interrupt while not in I/O function");
301
+ }
302
+}
303
+
304
+const CpusAccel tcg_cpus_icount = {
305
+ .create_vcpu_thread = tcg_start_vcpu_thread,
306
+ .kick_vcpu_thread = qemu_cpu_kick_rr_cpus,
307
+
308
+ .handle_interrupt = icount_handle_interrupt,
309
+ .get_virtual_clock = icount_get,
310
+ .get_elapsed_ticks = icount_get,
311
+};
312
diff --git a/accel/tcg/tcg-cpus-mttcg.c b/accel/tcg/tcg-cpus-mttcg.c
313
new file mode 100644
314
index XXXXXXX..XXXXXXX
315
--- /dev/null
316
+++ b/accel/tcg/tcg-cpus-mttcg.c
317
@@ -XXX,XX +XXX,XX @@
318
+/*
319
+ * QEMU TCG Multi Threaded vCPUs implementation
320
+ *
321
+ * Copyright (c) 2003-2008 Fabrice Bellard
322
+ * Copyright (c) 2014 Red Hat Inc.
323
+ *
324
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
325
+ * of this software and associated documentation files (the "Software"), to deal
326
+ * in the Software without restriction, including without limitation the rights
327
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
328
+ * copies of the Software, and to permit persons to whom the Software is
329
+ * furnished to do so, subject to the following conditions:
330
+ *
331
+ * The above copyright notice and this permission notice shall be included in
332
+ * all copies or substantial portions of the Software.
333
+ *
334
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
335
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
336
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
337
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
338
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
339
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
340
+ * THE SOFTWARE.
341
+ */
342
+
343
+#include "qemu/osdep.h"
344
+#include "qemu-common.h"
345
+#include "sysemu/tcg.h"
346
+#include "sysemu/replay.h"
347
+#include "qemu/main-loop.h"
348
+#include "qemu/guest-random.h"
349
+#include "exec/exec-all.h"
350
+#include "hw/boards.h"
351
+
352
+#include "tcg-cpus.h"
353
+#include "tcg-cpus-mttcg.h"
354
+
355
+/*
356
+ * In the multi-threaded case each vCPU has its own thread. The TLS
357
+ * variable current_cpu can be used deep in the code to find the
358
+ * current CPUState for a given thread.
359
+ */
360
+
361
+void *tcg_cpu_thread_fn(void *arg)
362
+{
363
+ CPUState *cpu = arg;
364
+
365
+ assert(tcg_enabled());
366
+ g_assert(!icount_enabled());
367
+
368
+ rcu_register_thread();
369
+ tcg_register_thread();
370
+
371
+ qemu_mutex_lock_iothread();
372
+ qemu_thread_get_self(cpu->thread);
373
+
374
+ cpu->thread_id = qemu_get_thread_id();
375
+ cpu->can_do_io = 1;
376
+ current_cpu = cpu;
377
+ cpu_thread_signal_created(cpu);
378
+ qemu_guest_random_seed_thread_part2(cpu->random_seed);
379
+
380
+ /* process any pending work */
381
+ cpu->exit_request = 1;
382
+
383
+ do {
384
+ if (cpu_can_run(cpu)) {
385
+ int r;
386
+ qemu_mutex_unlock_iothread();
387
+ r = tcg_cpu_exec(cpu);
388
+ qemu_mutex_lock_iothread();
389
+ switch (r) {
390
+ case EXCP_DEBUG:
391
+ cpu_handle_guest_debug(cpu);
392
+ break;
393
+ case EXCP_HALTED:
394
+ /*
395
+ * during start-up the vCPU is reset and the thread is
396
+ * kicked several times. If we don't ensure we go back
397
+ * to sleep in the halted state we won't cleanly
398
+ * start-up when the vCPU is enabled.
399
+ *
400
+ * cpu->halted should ensure we sleep in wait_io_event
401
+ */
402
+ g_assert(cpu->halted);
403
+ break;
404
+ case EXCP_ATOMIC:
405
+ qemu_mutex_unlock_iothread();
406
+ cpu_exec_step_atomic(cpu);
407
+ qemu_mutex_lock_iothread();
408
+ default:
409
+ /* Ignore everything else? */
410
+ break;
411
+ }
412
+ }
413
+
414
+ qatomic_mb_set(&cpu->exit_request, 0);
415
+ qemu_wait_io_event(cpu);
416
+ } while (!cpu->unplug || cpu_can_run(cpu));
417
+
418
+ qemu_tcg_destroy_vcpu(cpu);
419
+ qemu_mutex_unlock_iothread();
420
+ rcu_unregister_thread();
421
+ return NULL;
422
+}
423
+
424
+static void mttcg_kick_vcpu_thread(CPUState *cpu)
425
+{
426
+ cpu_exit(cpu);
427
+}
428
+
429
+const CpusAccel tcg_cpus_mttcg = {
430
+ .create_vcpu_thread = tcg_start_vcpu_thread,
431
+ .kick_vcpu_thread = mttcg_kick_vcpu_thread,
432
+
433
+ .handle_interrupt = tcg_handle_interrupt,
434
+};
435
diff --git a/accel/tcg/tcg-cpus-rr.c b/accel/tcg/tcg-cpus-rr.c
436
new file mode 100644
437
index XXXXXXX..XXXXXXX
438
--- /dev/null
439
+++ b/accel/tcg/tcg-cpus-rr.c
440
@@ -XXX,XX +XXX,XX @@
441
+/*
442
+ * QEMU TCG Single Threaded vCPUs implementation
443
+ *
444
+ * Copyright (c) 2003-2008 Fabrice Bellard
445
+ * Copyright (c) 2014 Red Hat Inc.
446
+ *
447
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
448
+ * of this software and associated documentation files (the "Software"), to deal
449
+ * in the Software without restriction, including without limitation the rights
450
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
451
+ * copies of the Software, and to permit persons to whom the Software is
452
+ * furnished to do so, subject to the following conditions:
453
+ *
454
+ * The above copyright notice and this permission notice shall be included in
455
+ * all copies or substantial portions of the Software.
456
+ *
457
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
458
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
459
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
460
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
461
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
462
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
463
+ * THE SOFTWARE.
464
+ */
465
+
466
+#include "qemu/osdep.h"
467
+#include "qemu-common.h"
468
+#include "sysemu/tcg.h"
469
+#include "sysemu/replay.h"
470
+#include "qemu/main-loop.h"
471
+#include "qemu/guest-random.h"
472
+#include "exec/exec-all.h"
473
+#include "hw/boards.h"
474
+
475
+#include "tcg-cpus.h"
476
+#include "tcg-cpus-rr.h"
477
+#include "tcg-cpus-icount.h"
478
+
479
+/* Kick all RR vCPUs */
480
+void qemu_cpu_kick_rr_cpus(CPUState *unused)
481
+{
482
+ CPUState *cpu;
483
+
484
+ CPU_FOREACH(cpu) {
485
+ cpu_exit(cpu);
486
+ };
487
+}
488
+
489
+/*
490
+ * TCG vCPU kick timer
491
+ *
492
+ * The kick timer is responsible for moving single threaded vCPU
493
+ * emulation on to the next vCPU. If more than one vCPU is running a
494
+ * timer event with force a cpu->exit so the next vCPU can get
495
+ * scheduled.
496
+ *
497
+ * The timer is removed if all vCPUs are idle and restarted again once
498
+ * idleness is complete.
499
+ */
500
+
501
+static QEMUTimer *tcg_kick_vcpu_timer;
502
+static CPUState *tcg_current_rr_cpu;
503
+
504
+#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
505
+
506
+static inline int64_t qemu_tcg_next_kick(void)
507
+{
508
+ return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
509
+}
510
+
511
+/* Kick the currently round-robin scheduled vCPU to next */
512
+static void qemu_cpu_kick_rr_next_cpu(void)
513
+{
514
+ CPUState *cpu;
515
+ do {
516
+ cpu = qatomic_mb_read(&tcg_current_rr_cpu);
517
+ if (cpu) {
518
+ cpu_exit(cpu);
519
+ }
520
+ } while (cpu != qatomic_mb_read(&tcg_current_rr_cpu));
521
+}
522
+
523
+static void kick_tcg_thread(void *opaque)
524
+{
525
+ timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
526
+ qemu_cpu_kick_rr_next_cpu();
527
+}
528
+
529
+static void start_tcg_kick_timer(void)
530
+{
531
+ if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
532
+ tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
533
+ kick_tcg_thread, NULL);
534
+ }
535
+ if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
536
+ timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
537
+ }
538
+}
539
+
540
+static void stop_tcg_kick_timer(void)
541
+{
542
+ if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
543
+ timer_del(tcg_kick_vcpu_timer);
544
+ }
545
+}
546
+
547
+static void qemu_tcg_rr_wait_io_event(void)
548
+{
549
+ CPUState *cpu;
550
+
551
+ while (all_cpu_threads_idle()) {
552
+ stop_tcg_kick_timer();
553
+ qemu_cond_wait_iothread(first_cpu->halt_cond);
554
+ }
555
+
556
+ start_tcg_kick_timer();
557
+
558
+ CPU_FOREACH(cpu) {
559
+ qemu_wait_io_event_common(cpu);
560
+ }
561
+}
562
+
563
+/*
564
+ * Destroy any remaining vCPUs which have been unplugged and have
565
+ * finished running
566
+ */
567
+static void deal_with_unplugged_cpus(void)
568
+{
569
+ CPUState *cpu;
570
+
571
+ CPU_FOREACH(cpu) {
572
+ if (cpu->unplug && !cpu_can_run(cpu)) {
573
+ qemu_tcg_destroy_vcpu(cpu);
574
+ break;
575
+ }
576
+ }
577
+}
578
+
579
+/*
580
+ * In the single-threaded case each vCPU is simulated in turn. If
581
+ * there is more than a single vCPU we create a simple timer to kick
582
+ * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
583
+ * This is done explicitly rather than relying on side-effects
584
+ * elsewhere.
585
+ */
586
+
587
+void *tcg_rr_cpu_thread_fn(void *arg)
588
+{
589
+ CPUState *cpu = arg;
590
+
591
+ assert(tcg_enabled());
592
+ rcu_register_thread();
593
+ tcg_register_thread();
594
+
595
+ qemu_mutex_lock_iothread();
596
+ qemu_thread_get_self(cpu->thread);
597
+
598
+ cpu->thread_id = qemu_get_thread_id();
599
+ cpu->can_do_io = 1;
600
+ cpu_thread_signal_created(cpu);
601
+ qemu_guest_random_seed_thread_part2(cpu->random_seed);
602
+
603
+ /* wait for initial kick-off after machine start */
604
+ while (first_cpu->stopped) {
605
+ qemu_cond_wait_iothread(first_cpu->halt_cond);
606
+
607
+ /* process any pending work */
608
+ CPU_FOREACH(cpu) {
609
+ current_cpu = cpu;
610
+ qemu_wait_io_event_common(cpu);
611
+ }
612
+ }
613
+
614
+ start_tcg_kick_timer();
615
+
616
+ cpu = first_cpu;
617
+
618
+ /* process any pending work */
619
+ cpu->exit_request = 1;
620
+
621
+ while (1) {
622
+ qemu_mutex_unlock_iothread();
623
+ replay_mutex_lock();
624
+ qemu_mutex_lock_iothread();
625
+
626
+ if (icount_enabled()) {
627
+ /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
628
+ icount_account_warp_timer();
629
+ /*
630
+ * Run the timers here. This is much more efficient than
631
+ * waking up the I/O thread and waiting for completion.
632
+ */
633
+ handle_icount_deadline();
634
+ }
635
+
636
+ replay_mutex_unlock();
637
+
638
+ if (!cpu) {
639
+ cpu = first_cpu;
640
+ }
641
+
642
+ while (cpu && cpu_work_list_empty(cpu) && !cpu->exit_request) {
643
+
644
+ qatomic_mb_set(&tcg_current_rr_cpu, cpu);
645
+ current_cpu = cpu;
646
+
647
+ qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
648
+ (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
649
+
650
+ if (cpu_can_run(cpu)) {
651
+ int r;
652
+
653
+ qemu_mutex_unlock_iothread();
654
+ if (icount_enabled()) {
655
+ prepare_icount_for_run(cpu);
656
+ }
657
+ r = tcg_cpu_exec(cpu);
658
+ if (icount_enabled()) {
659
+ process_icount_data(cpu);
660
+ }
661
+ qemu_mutex_lock_iothread();
662
+
663
+ if (r == EXCP_DEBUG) {
664
+ cpu_handle_guest_debug(cpu);
665
+ break;
666
+ } else if (r == EXCP_ATOMIC) {
667
+ qemu_mutex_unlock_iothread();
668
+ cpu_exec_step_atomic(cpu);
669
+ qemu_mutex_lock_iothread();
670
+ break;
671
+ }
672
+ } else if (cpu->stop) {
673
+ if (cpu->unplug) {
674
+ cpu = CPU_NEXT(cpu);
675
+ }
676
+ break;
677
+ }
678
+
679
+ cpu = CPU_NEXT(cpu);
680
+ } /* while (cpu && !cpu->exit_request).. */
681
+
682
+ /* Does not need qatomic_mb_set because a spurious wakeup is okay. */
683
+ qatomic_set(&tcg_current_rr_cpu, NULL);
684
+
685
+ if (cpu && cpu->exit_request) {
686
+ qatomic_mb_set(&cpu->exit_request, 0);
687
+ }
688
+
689
+ if (icount_enabled() && all_cpu_threads_idle()) {
690
+ /*
691
+ * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
692
+ * in the main_loop, wake it up in order to start the warp timer.
693
+ */
694
+ qemu_notify_event();
695
+ }
696
+
697
+ qemu_tcg_rr_wait_io_event();
698
+ deal_with_unplugged_cpus();
699
+ }
700
+
701
+ rcu_unregister_thread();
702
+ return NULL;
703
+}
704
+
705
+const CpusAccel tcg_cpus_rr = {
706
+ .create_vcpu_thread = tcg_start_vcpu_thread,
707
+ .kick_vcpu_thread = qemu_cpu_kick_rr_cpus,
708
+
709
+ .handle_interrupt = tcg_handle_interrupt,
710
+};
711
diff --git a/accel/tcg/tcg-cpus.c b/accel/tcg/tcg-cpus.c
712
index XXXXXXX..XXXXXXX 100644
713
--- a/accel/tcg/tcg-cpus.c
714
+++ b/accel/tcg/tcg-cpus.c
715
@@ -XXX,XX +XXX,XX @@
716
/*
717
- * QEMU System Emulator
718
+ * QEMU TCG vCPU common functionality
719
+ *
720
+ * Functionality common to all TCG vCPU variants: mttcg, rr and icount.
721
*
722
* Copyright (c) 2003-2008 Fabrice Bellard
723
* Copyright (c) 2014 Red Hat Inc.
724
@@ -XXX,XX +XXX,XX @@
725
#include "hw/boards.h"
726
727
#include "tcg-cpus.h"
728
+#include "tcg-cpus-mttcg.h"
729
+#include "tcg-cpus-rr.h"
730
731
-/* Kick all RR vCPUs */
732
-static void qemu_cpu_kick_rr_cpus(void)
733
-{
734
- CPUState *cpu;
735
+/* common functionality among all TCG variants */
736
737
- CPU_FOREACH(cpu) {
738
- cpu_exit(cpu);
739
- };
740
-}
741
-
742
-static void tcg_kick_vcpu_thread(CPUState *cpu)
743
-{
744
- if (qemu_tcg_mttcg_enabled()) {
745
- cpu_exit(cpu);
746
- } else {
747
- qemu_cpu_kick_rr_cpus();
748
- }
749
-}
750
-
751
-/*
752
- * TCG vCPU kick timer
753
- *
754
- * The kick timer is responsible for moving single threaded vCPU
755
- * emulation on to the next vCPU. If more than one vCPU is running a
756
- * timer event with force a cpu->exit so the next vCPU can get
757
- * scheduled.
758
- *
759
- * The timer is removed if all vCPUs are idle and restarted again once
760
- * idleness is complete.
761
- */
762
-
763
-static QEMUTimer *tcg_kick_vcpu_timer;
764
-static CPUState *tcg_current_rr_cpu;
765
-
766
-#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
767
-
768
-static inline int64_t qemu_tcg_next_kick(void)
769
-{
770
- return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
771
-}
772
-
773
-/* Kick the currently round-robin scheduled vCPU to next */
774
-static void qemu_cpu_kick_rr_next_cpu(void)
775
-{
776
- CPUState *cpu;
777
- do {
778
- cpu = qatomic_mb_read(&tcg_current_rr_cpu);
779
- if (cpu) {
780
- cpu_exit(cpu);
781
- }
782
- } while (cpu != qatomic_mb_read(&tcg_current_rr_cpu));
783
-}
784
-
785
-static void kick_tcg_thread(void *opaque)
786
-{
787
- timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
788
- qemu_cpu_kick_rr_next_cpu();
789
-}
790
-
791
-static void start_tcg_kick_timer(void)
792
-{
793
- assert(!mttcg_enabled);
794
- if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
795
- tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
796
- kick_tcg_thread, NULL);
797
- }
798
- if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
799
- timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
800
- }
801
-}
802
-
803
-static void stop_tcg_kick_timer(void)
804
-{
805
- assert(!mttcg_enabled);
806
- if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
807
- timer_del(tcg_kick_vcpu_timer);
808
- }
809
-}
810
-
811
-static void qemu_tcg_destroy_vcpu(CPUState *cpu)
812
-{
813
-}
814
-
815
-static void qemu_tcg_rr_wait_io_event(void)
816
-{
817
- CPUState *cpu;
818
-
819
- while (all_cpu_threads_idle()) {
820
- stop_tcg_kick_timer();
821
- qemu_cond_wait_iothread(first_cpu->halt_cond);
822
- }
823
-
824
- start_tcg_kick_timer();
825
-
826
- CPU_FOREACH(cpu) {
827
- qemu_wait_io_event_common(cpu);
828
- }
829
-}
830
-
831
-static int64_t tcg_get_icount_limit(void)
832
-{
833
- int64_t deadline;
834
-
835
- if (replay_mode != REPLAY_MODE_PLAY) {
836
- /*
837
- * Include all the timers, because they may need an attention.
838
- * Too long CPU execution may create unnecessary delay in UI.
839
- */
840
- deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
841
- QEMU_TIMER_ATTR_ALL);
842
- /* Check realtime timers, because they help with input processing */
843
- deadline = qemu_soonest_timeout(deadline,
844
- qemu_clock_deadline_ns_all(QEMU_CLOCK_REALTIME,
845
- QEMU_TIMER_ATTR_ALL));
846
-
847
- /*
848
- * Maintain prior (possibly buggy) behaviour where if no deadline
849
- * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
850
- * INT32_MAX nanoseconds ahead, we still use INT32_MAX
851
- * nanoseconds.
852
- */
853
- if ((deadline < 0) || (deadline > INT32_MAX)) {
854
- deadline = INT32_MAX;
855
- }
856
-
857
- return icount_round(deadline);
858
- } else {
859
- return replay_get_instructions();
860
- }
861
-}
862
-
863
-static void notify_aio_contexts(void)
864
-{
865
- /* Wake up other AioContexts. */
866
- qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
867
- qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
868
-}
869
-
870
-static void handle_icount_deadline(void)
871
-{
872
- assert(qemu_in_vcpu_thread());
873
- if (icount_enabled()) {
874
- int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
875
- QEMU_TIMER_ATTR_ALL);
876
-
877
- if (deadline == 0) {
878
- notify_aio_contexts();
879
- }
880
- }
881
-}
882
-
883
-static void prepare_icount_for_run(CPUState *cpu)
884
-{
885
- if (icount_enabled()) {
886
- int insns_left;
887
-
888
- /*
889
- * These should always be cleared by process_icount_data after
890
- * each vCPU execution. However u16.high can be raised
891
- * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
892
- */
893
- g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
894
- g_assert(cpu->icount_extra == 0);
895
-
896
- cpu->icount_budget = tcg_get_icount_limit();
897
- insns_left = MIN(0xffff, cpu->icount_budget);
898
- cpu_neg(cpu)->icount_decr.u16.low = insns_left;
899
- cpu->icount_extra = cpu->icount_budget - insns_left;
900
-
901
- replay_mutex_lock();
902
-
903
- if (cpu->icount_budget == 0 && replay_has_checkpoint()) {
904
- notify_aio_contexts();
905
- }
906
- }
907
-}
908
-
909
-static void process_icount_data(CPUState *cpu)
910
-{
911
- if (icount_enabled()) {
912
- /* Account for executed instructions */
913
- icount_update(cpu);
914
-
915
- /* Reset the counters */
916
- cpu_neg(cpu)->icount_decr.u16.low = 0;
917
- cpu->icount_extra = 0;
918
- cpu->icount_budget = 0;
919
-
920
- replay_account_executed_instructions();
921
-
922
- replay_mutex_unlock();
923
- }
924
-}
925
-
926
-static int tcg_cpu_exec(CPUState *cpu)
927
-{
928
- int ret;
929
-#ifdef CONFIG_PROFILER
930
- int64_t ti;
931
-#endif
932
-
933
- assert(tcg_enabled());
934
-#ifdef CONFIG_PROFILER
935
- ti = profile_getclock();
936
-#endif
937
- cpu_exec_start(cpu);
938
- ret = cpu_exec(cpu);
939
- cpu_exec_end(cpu);
940
-#ifdef CONFIG_PROFILER
941
- qatomic_set(&tcg_ctx->prof.cpu_exec_time,
942
- tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
943
-#endif
944
- return ret;
945
-}
946
-
947
-/*
948
- * Destroy any remaining vCPUs which have been unplugged and have
949
- * finished running
950
- */
951
-static void deal_with_unplugged_cpus(void)
952
-{
953
- CPUState *cpu;
954
-
955
- CPU_FOREACH(cpu) {
956
- if (cpu->unplug && !cpu_can_run(cpu)) {
957
- qemu_tcg_destroy_vcpu(cpu);
958
- cpu_thread_signal_destroyed(cpu);
959
- break;
960
- }
961
- }
962
-}
963
-
964
-/*
965
- * Single-threaded TCG
966
- *
967
- * In the single-threaded case each vCPU is simulated in turn. If
968
- * there is more than a single vCPU we create a simple timer to kick
969
- * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
970
- * This is done explicitly rather than relying on side-effects
971
- * elsewhere.
972
- */
973
-
974
-static void *tcg_rr_cpu_thread_fn(void *arg)
975
-{
976
- CPUState *cpu = arg;
977
-
978
- assert(tcg_enabled());
979
- rcu_register_thread();
980
- tcg_register_thread();
981
-
982
- qemu_mutex_lock_iothread();
983
- qemu_thread_get_self(cpu->thread);
984
-
985
- cpu->thread_id = qemu_get_thread_id();
986
- cpu->can_do_io = 1;
987
- cpu_thread_signal_created(cpu);
988
- qemu_guest_random_seed_thread_part2(cpu->random_seed);
989
-
990
- /* wait for initial kick-off after machine start */
991
- while (first_cpu->stopped) {
992
- qemu_cond_wait_iothread(first_cpu->halt_cond);
993
-
994
- /* process any pending work */
995
- CPU_FOREACH(cpu) {
996
- current_cpu = cpu;
997
- qemu_wait_io_event_common(cpu);
998
- }
999
- }
1000
-
1001
- start_tcg_kick_timer();
1002
-
1003
- cpu = first_cpu;
1004
-
1005
- /* process any pending work */
1006
- cpu->exit_request = 1;
1007
-
1008
- while (1) {
1009
- qemu_mutex_unlock_iothread();
1010
- replay_mutex_lock();
1011
- qemu_mutex_lock_iothread();
1012
- /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
1013
- icount_account_warp_timer();
1014
-
1015
- /*
1016
- * Run the timers here. This is much more efficient than
1017
- * waking up the I/O thread and waiting for completion.
1018
- */
1019
- handle_icount_deadline();
1020
-
1021
- replay_mutex_unlock();
1022
-
1023
- if (!cpu) {
1024
- cpu = first_cpu;
1025
- }
1026
-
1027
- while (cpu && cpu_work_list_empty(cpu) && !cpu->exit_request) {
1028
-
1029
- qatomic_mb_set(&tcg_current_rr_cpu, cpu);
1030
- current_cpu = cpu;
1031
-
1032
- qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1033
- (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1034
-
1035
- if (cpu_can_run(cpu)) {
1036
- int r;
1037
-
1038
- qemu_mutex_unlock_iothread();
1039
- prepare_icount_for_run(cpu);
1040
-
1041
- r = tcg_cpu_exec(cpu);
1042
-
1043
- process_icount_data(cpu);
1044
- qemu_mutex_lock_iothread();
1045
-
1046
- if (r == EXCP_DEBUG) {
1047
- cpu_handle_guest_debug(cpu);
1048
- break;
1049
- } else if (r == EXCP_ATOMIC) {
1050
- qemu_mutex_unlock_iothread();
1051
- cpu_exec_step_atomic(cpu);
1052
- qemu_mutex_lock_iothread();
1053
- break;
1054
- }
1055
- } else if (cpu->stop) {
1056
- if (cpu->unplug) {
1057
- cpu = CPU_NEXT(cpu);
1058
- }
1059
- break;
1060
- }
1061
-
1062
- cpu = CPU_NEXT(cpu);
1063
- } /* while (cpu && !cpu->exit_request).. */
1064
-
1065
- /* Does not need qatomic_mb_set because a spurious wakeup is okay. */
1066
- qatomic_set(&tcg_current_rr_cpu, NULL);
1067
-
1068
- if (cpu && cpu->exit_request) {
1069
- qatomic_mb_set(&cpu->exit_request, 0);
1070
- }
1071
-
1072
- if (icount_enabled() && all_cpu_threads_idle()) {
1073
- /*
1074
- * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
1075
- * in the main_loop, wake it up in order to start the warp timer.
1076
- */
1077
- qemu_notify_event();
1078
- }
1079
-
1080
- qemu_tcg_rr_wait_io_event();
1081
- deal_with_unplugged_cpus();
1082
- }
1083
-
1084
- rcu_unregister_thread();
1085
- return NULL;
1086
-}
1087
-
1088
-/*
1089
- * Multi-threaded TCG
1090
- *
1091
- * In the multi-threaded case each vCPU has its own thread. The TLS
1092
- * variable current_cpu can be used deep in the code to find the
1093
- * current CPUState for a given thread.
1094
- */
1095
-
1096
-static void *tcg_cpu_thread_fn(void *arg)
1097
-{
1098
- CPUState *cpu = arg;
1099
-
1100
- assert(tcg_enabled());
1101
- g_assert(!icount_enabled());
1102
-
1103
- rcu_register_thread();
1104
- tcg_register_thread();
1105
-
1106
- qemu_mutex_lock_iothread();
1107
- qemu_thread_get_self(cpu->thread);
1108
-
1109
- cpu->thread_id = qemu_get_thread_id();
1110
- cpu->can_do_io = 1;
1111
- current_cpu = cpu;
1112
- cpu_thread_signal_created(cpu);
1113
- qemu_guest_random_seed_thread_part2(cpu->random_seed);
1114
-
1115
- /* process any pending work */
1116
- cpu->exit_request = 1;
1117
-
1118
- do {
1119
- if (cpu_can_run(cpu)) {
1120
- int r;
1121
- qemu_mutex_unlock_iothread();
1122
- r = tcg_cpu_exec(cpu);
1123
- qemu_mutex_lock_iothread();
1124
- switch (r) {
1125
- case EXCP_DEBUG:
1126
- cpu_handle_guest_debug(cpu);
1127
- break;
1128
- case EXCP_HALTED:
1129
- /*
1130
- * during start-up the vCPU is reset and the thread is
1131
- * kicked several times. If we don't ensure we go back
1132
- * to sleep in the halted state we won't cleanly
1133
- * start-up when the vCPU is enabled.
1134
- *
1135
- * cpu->halted should ensure we sleep in wait_io_event
1136
- */
1137
- g_assert(cpu->halted);
1138
- break;
1139
- case EXCP_ATOMIC:
1140
- qemu_mutex_unlock_iothread();
1141
- cpu_exec_step_atomic(cpu);
1142
- qemu_mutex_lock_iothread();
1143
- default:
1144
- /* Ignore everything else? */
1145
- break;
1146
- }
1147
- }
1148
-
1149
- qatomic_mb_set(&cpu->exit_request, 0);
1150
- qemu_wait_io_event(cpu);
1151
- } while (!cpu->unplug || cpu_can_run(cpu));
1152
-
1153
- qemu_tcg_destroy_vcpu(cpu);
1154
- cpu_thread_signal_destroyed(cpu);
1155
- qemu_mutex_unlock_iothread();
1156
- rcu_unregister_thread();
1157
- return NULL;
1158
-}
1159
-
1160
-static void tcg_start_vcpu_thread(CPUState *cpu)
1161
+void tcg_start_vcpu_thread(CPUState *cpu)
1162
{
1163
char thread_name[VCPU_THREAD_NAME_SIZE];
1164
static QemuCond *single_tcg_halt_cond;
1165
@@ -XXX,XX +XXX,XX @@ static void tcg_start_vcpu_thread(CPUState *cpu)
1166
}
1167
}
1168
1169
-static int64_t tcg_get_virtual_clock(void)
1170
+void qemu_tcg_destroy_vcpu(CPUState *cpu)
1171
{
1172
- if (icount_enabled()) {
1173
- return icount_get();
1174
- }
1175
- return cpu_get_clock();
1176
+ cpu_thread_signal_destroyed(cpu);
1177
}
1178
1179
-static int64_t tcg_get_elapsed_ticks(void)
1180
+int tcg_cpu_exec(CPUState *cpu)
1181
{
1182
- if (icount_enabled()) {
1183
- return icount_get();
1184
- }
1185
- return cpu_get_ticks();
1186
+ int ret;
1187
+#ifdef CONFIG_PROFILER
1188
+ int64_t ti;
1189
+#endif
1190
+ assert(tcg_enabled());
1191
+#ifdef CONFIG_PROFILER
1192
+ ti = profile_getclock();
1193
+#endif
1194
+ cpu_exec_start(cpu);
1195
+ ret = cpu_exec(cpu);
1196
+ cpu_exec_end(cpu);
1197
+#ifdef CONFIG_PROFILER
1198
+ qatomic_set(&tcg_ctx->prof.cpu_exec_time,
1199
+ tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
1200
+#endif
1201
+ return ret;
1202
}
1203
1204
/* mask must never be zero, except for A20 change call */
1205
-static void tcg_handle_interrupt(CPUState *cpu, int mask)
1206
+void tcg_handle_interrupt(CPUState *cpu, int mask)
1207
{
1208
- int old_mask;
1209
g_assert(qemu_mutex_iothread_locked());
1210
1211
- old_mask = cpu->interrupt_request;
1212
cpu->interrupt_request |= mask;
1213
1214
/*
1215
@@ -XXX,XX +XXX,XX @@ static void tcg_handle_interrupt(CPUState *cpu, int mask)
1216
qemu_cpu_kick(cpu);
1217
} else {
1218
qatomic_set(&cpu_neg(cpu)->icount_decr.u16.high, -1);
1219
- if (icount_enabled() &&
1220
- !cpu->can_do_io
1221
- && (mask & ~old_mask) != 0) {
1222
- cpu_abort(cpu, "Raised interrupt while not in I/O function");
1223
- }
1224
}
1225
}
1226
-
1227
-const CpusAccel tcg_cpus = {
1228
- .create_vcpu_thread = tcg_start_vcpu_thread,
1229
- .kick_vcpu_thread = tcg_kick_vcpu_thread,
1230
-
1231
- .handle_interrupt = tcg_handle_interrupt,
1232
-
1233
- .get_virtual_clock = tcg_get_virtual_clock,
1234
- .get_elapsed_ticks = tcg_get_elapsed_ticks,
1235
-};
1236
diff --git a/softmmu/icount.c b/softmmu/icount.c
1237
index XXXXXXX..XXXXXXX 100644
1238
--- a/softmmu/icount.c
1239
+++ b/softmmu/icount.c
1240
@@ -XXX,XX +XXX,XX @@ void icount_start_warp_timer(void)
1241
1242
void icount_account_warp_timer(void)
1243
{
1244
- if (!icount_enabled() || !icount_sleep) {
1245
+ if (!icount_sleep) {
1246
return;
1247
}
1248
1249
diff --git a/accel/tcg/meson.build b/accel/tcg/meson.build
1250
index XXXXXXX..XXXXXXX 100644
1251
--- a/accel/tcg/meson.build
1252
+++ b/accel/tcg/meson.build
1253
@@ -XXX,XX +XXX,XX @@ tcg_ss.add(when: 'CONFIG_SOFTMMU', if_false: files('user-exec-stub.c'))
1254
tcg_ss.add(when: 'CONFIG_PLUGIN', if_true: [files('plugin-gen.c'), libdl])
1255
specific_ss.add_all(when: 'CONFIG_TCG', if_true: tcg_ss)
1256
1257
-specific_ss.add(when: ['CONFIG_SOFTMMU', 'CONFIG_TCG'], if_true: files('tcg-all.c', 'cputlb.c', 'tcg-cpus.c'))
1258
+specific_ss.add(when: ['CONFIG_SOFTMMU', 'CONFIG_TCG'], if_true: files(
1259
+ 'tcg-all.c',
1260
+ 'cputlb.c',
1261
+ 'tcg-cpus.c',
1262
+ 'tcg-cpus-mttcg.c',
1263
+ 'tcg-cpus-icount.c',
1264
+ 'tcg-cpus-rr.c'
1265
+))
1266
--
81
--
1267
2.25.1
82
2.34.1
1268
83
1269
84
New patch
1
1
We now have the option to generate direct or indirect
goto_tb depending on the dynamic displacement, thus
the define is no longer necessary or completely accurate.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
---
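For illustration only, here is a rough standalone sketch of the per-destination
choice this enables: a backend can now decide at retarget time whether a direct
branch reaches, and otherwise fall back to the indirect load. This is not QEMU
code; the 28-bit range and the helper names are assumptions loosely modelled on
the AArch64 B encoding.

    #include <stdint.h>
    #include <stdio.h>

    /* Does 'value' fit in a signed field of 'bits' bits? */
    static int fits_signed(int64_t value, unsigned bits)
    {
        int64_t min = -((int64_t)1 << (bits - 1));
        int64_t max = ((int64_t)1 << (bits - 1)) - 1;
        return value >= min && value <= max;
    }

    /* Decide how to retarget the goto_tb slot at jmp_rx for a new destination. */
    static const char *pick_goto_tb_form(uint64_t jmp_rx, uint64_t dest)
    {
        int64_t disp = (int64_t)(dest - jmp_rx);

        /* A direct branch encodes a word-aligned, signed 28-bit displacement. */
        if ((disp & 3) == 0 && fits_signed(disp, 28)) {
            return "patch direct branch";
        }
        /* Otherwise reload the destination from jmp_target_addr[] and jump. */
        return "patch indirect load + branch";
    }

    int main(void)
    {
        printf("%s\n", pick_goto_tb_form(0x1000, 0x2000));          /* in range */
        printf("%s\n", pick_goto_tb_form(0x1000, 0x7f0000000000));  /* out of range */
        return 0;
    }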
8
tcg/aarch64/tcg-target.h | 1 -
9
tcg/arm/tcg-target.h | 1 -
10
tcg/i386/tcg-target.h | 1 -
11
tcg/loongarch64/tcg-target.h | 1 -
12
tcg/mips/tcg-target.h | 1 -
13
tcg/ppc/tcg-target.h | 1 -
14
tcg/riscv/tcg-target.h | 1 -
15
tcg/s390x/tcg-target.h | 1 -
16
tcg/sparc64/tcg-target.h | 1 -
17
tcg/tci/tcg-target.h | 1 -
18
accel/tcg/cpu-exec.c | 23 +++++++++++------------
19
tcg/tcg.c | 1 -
20
tcg/arm/tcg-target.c.inc | 1 -
21
tcg/mips/tcg-target.c.inc | 1 -
22
tcg/riscv/tcg-target.c.inc | 1 -
23
tcg/s390x/tcg-target.c.inc | 3 +++
24
tcg/tci/tcg-target.c.inc | 1 -
25
17 files changed, 14 insertions(+), 27 deletions(-)
26
27
diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
28
index XXXXXXX..XXXXXXX 100644
29
--- a/tcg/aarch64/tcg-target.h
30
+++ b/tcg/aarch64/tcg-target.h
31
@@ -XXX,XX +XXX,XX @@ typedef enum {
32
#define TCG_TARGET_HAS_muls2_i64 0
33
#define TCG_TARGET_HAS_muluh_i64 1
34
#define TCG_TARGET_HAS_mulsh_i64 1
35
-#define TCG_TARGET_HAS_direct_jump 1
36
37
#define TCG_TARGET_HAS_v64 1
38
#define TCG_TARGET_HAS_v128 1
39
diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
40
index XXXXXXX..XXXXXXX 100644
41
--- a/tcg/arm/tcg-target.h
42
+++ b/tcg/arm/tcg-target.h
43
@@ -XXX,XX +XXX,XX @@ extern bool use_neon_instructions;
44
#define TCG_TARGET_HAS_mulsh_i32 0
45
#define TCG_TARGET_HAS_div_i32 use_idiv_instructions
46
#define TCG_TARGET_HAS_rem_i32 0
47
-#define TCG_TARGET_HAS_direct_jump 0
48
#define TCG_TARGET_HAS_qemu_st8_i32 0
49
50
#define TCG_TARGET_HAS_v64 use_neon_instructions
51
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
52
index XXXXXXX..XXXXXXX 100644
53
--- a/tcg/i386/tcg-target.h
54
+++ b/tcg/i386/tcg-target.h
55
@@ -XXX,XX +XXX,XX @@ extern bool have_movbe;
56
#define TCG_TARGET_HAS_muls2_i32 1
57
#define TCG_TARGET_HAS_muluh_i32 0
58
#define TCG_TARGET_HAS_mulsh_i32 0
59
-#define TCG_TARGET_HAS_direct_jump 1
60
61
#if TCG_TARGET_REG_BITS == 64
62
/* Keep target addresses zero-extended in a register. */
63
diff --git a/tcg/loongarch64/tcg-target.h b/tcg/loongarch64/tcg-target.h
64
index XXXXXXX..XXXXXXX 100644
65
--- a/tcg/loongarch64/tcg-target.h
66
+++ b/tcg/loongarch64/tcg-target.h
67
@@ -XXX,XX +XXX,XX @@ typedef enum {
68
#define TCG_TARGET_HAS_clz_i32 1
69
#define TCG_TARGET_HAS_ctz_i32 1
70
#define TCG_TARGET_HAS_ctpop_i32 0
71
-#define TCG_TARGET_HAS_direct_jump 1
72
#define TCG_TARGET_HAS_brcond2 0
73
#define TCG_TARGET_HAS_setcond2 0
74
#define TCG_TARGET_HAS_qemu_st8_i32 0
75
diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h
76
index XXXXXXX..XXXXXXX 100644
77
--- a/tcg/mips/tcg-target.h
78
+++ b/tcg/mips/tcg-target.h
79
@@ -XXX,XX +XXX,XX @@ extern bool use_mips32r2_instructions;
80
#define TCG_TARGET_HAS_muluh_i32 1
81
#define TCG_TARGET_HAS_mulsh_i32 1
82
#define TCG_TARGET_HAS_bswap32_i32 1
83
-#define TCG_TARGET_HAS_direct_jump 0
84
85
#if TCG_TARGET_REG_BITS == 64
86
#define TCG_TARGET_HAS_add2_i32 0
87
diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
88
index XXXXXXX..XXXXXXX 100644
89
--- a/tcg/ppc/tcg-target.h
90
+++ b/tcg/ppc/tcg-target.h
91
@@ -XXX,XX +XXX,XX @@ extern bool have_vsx;
92
#define TCG_TARGET_HAS_muls2_i32 0
93
#define TCG_TARGET_HAS_muluh_i32 1
94
#define TCG_TARGET_HAS_mulsh_i32 1
95
-#define TCG_TARGET_HAS_direct_jump 1
96
#define TCG_TARGET_HAS_qemu_st8_i32 0
97
98
#if TCG_TARGET_REG_BITS == 64
99
diff --git a/tcg/riscv/tcg-target.h b/tcg/riscv/tcg-target.h
100
index XXXXXXX..XXXXXXX 100644
101
--- a/tcg/riscv/tcg-target.h
102
+++ b/tcg/riscv/tcg-target.h
103
@@ -XXX,XX +XXX,XX @@ typedef enum {
104
#define TCG_TARGET_HAS_clz_i32 0
105
#define TCG_TARGET_HAS_ctz_i32 0
106
#define TCG_TARGET_HAS_ctpop_i32 0
107
-#define TCG_TARGET_HAS_direct_jump 0
108
#define TCG_TARGET_HAS_brcond2 1
109
#define TCG_TARGET_HAS_setcond2 1
110
#define TCG_TARGET_HAS_qemu_st8_i32 0
111
diff --git a/tcg/s390x/tcg-target.h b/tcg/s390x/tcg-target.h
112
index XXXXXXX..XXXXXXX 100644
113
--- a/tcg/s390x/tcg-target.h
114
+++ b/tcg/s390x/tcg-target.h
115
@@ -XXX,XX +XXX,XX @@ extern uint64_t s390_facilities[3];
116
#define TCG_TARGET_HAS_mulsh_i32 0
117
#define TCG_TARGET_HAS_extrl_i64_i32 0
118
#define TCG_TARGET_HAS_extrh_i64_i32 0
119
-#define TCG_TARGET_HAS_direct_jump 1
120
#define TCG_TARGET_HAS_qemu_st8_i32 0
121
122
#define TCG_TARGET_HAS_div2_i64 1
123
diff --git a/tcg/sparc64/tcg-target.h b/tcg/sparc64/tcg-target.h
124
index XXXXXXX..XXXXXXX 100644
125
--- a/tcg/sparc64/tcg-target.h
126
+++ b/tcg/sparc64/tcg-target.h
127
@@ -XXX,XX +XXX,XX @@ extern bool use_vis3_instructions;
128
#define TCG_TARGET_HAS_muls2_i32 1
129
#define TCG_TARGET_HAS_muluh_i32 0
130
#define TCG_TARGET_HAS_mulsh_i32 0
131
-#define TCG_TARGET_HAS_direct_jump 1
132
#define TCG_TARGET_HAS_qemu_st8_i32 0
133
134
#define TCG_TARGET_HAS_extrl_i64_i32 1
135
diff --git a/tcg/tci/tcg-target.h b/tcg/tci/tcg-target.h
136
index XXXXXXX..XXXXXXX 100644
137
--- a/tcg/tci/tcg-target.h
138
+++ b/tcg/tci/tcg-target.h
139
@@ -XXX,XX +XXX,XX @@
140
#define TCG_TARGET_HAS_muls2_i32 1
141
#define TCG_TARGET_HAS_muluh_i32 0
142
#define TCG_TARGET_HAS_mulsh_i32 0
143
-#define TCG_TARGET_HAS_direct_jump 0
144
#define TCG_TARGET_HAS_qemu_st8_i32 0
145
146
#if TCG_TARGET_REG_BITS == 64
147
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
148
index XXXXXXX..XXXXXXX 100644
149
--- a/accel/tcg/cpu-exec.c
150
+++ b/accel/tcg/cpu-exec.c
151
@@ -XXX,XX +XXX,XX @@ void cpu_exec_step_atomic(CPUState *cpu)
152
153
void tb_set_jmp_target(TranslationBlock *tb, int n, uintptr_t addr)
154
{
155
+ /*
156
+ * Get the rx view of the structure, from which we find the
157
+ * executable code address, and tb_target_set_jmp_target can
158
+ * produce a pc-relative displacement to jmp_target_addr[n].
159
+ */
160
+ const TranslationBlock *c_tb = tcg_splitwx_to_rx(tb);
161
+ uintptr_t offset = tb->jmp_insn_offset[n];
162
+ uintptr_t jmp_rx = (uintptr_t)tb->tc.ptr + offset;
163
+ uintptr_t jmp_rw = jmp_rx - tcg_splitwx_diff;
164
+
165
tb->jmp_target_addr[n] = addr;
166
- if (TCG_TARGET_HAS_direct_jump) {
167
- /*
168
- * Get the rx view of the structure, from which we find the
169
- * executable code address, and tb_target_set_jmp_target can
170
- * produce a pc-relative displacement to jmp_target_addr[n].
171
- */
172
- const TranslationBlock *c_tb = tcg_splitwx_to_rx(tb);
173
- uintptr_t offset = tb->jmp_insn_offset[n];
174
- uintptr_t jmp_rx = (uintptr_t)tb->tc.ptr + offset;
175
- uintptr_t jmp_rw = jmp_rx - tcg_splitwx_diff;
176
- tb_target_set_jmp_target(c_tb, n, jmp_rx, jmp_rw);
177
- }
178
+ tb_target_set_jmp_target(c_tb, n, jmp_rx, jmp_rw);
179
}
180
181
static inline void tb_add_jump(TranslationBlock *tb, int n,
182
diff --git a/tcg/tcg.c b/tcg/tcg.c
183
index XXXXXXX..XXXXXXX 100644
184
--- a/tcg/tcg.c
185
+++ b/tcg/tcg.c
186
@@ -XXX,XX +XXX,XX @@ static void G_GNUC_UNUSED set_jmp_insn_offset(TCGContext *s, int which)
187
* We will check for overflow at the end of the opcode loop in
188
* tcg_gen_code, where we bound tcg_current_code_size to UINT16_MAX.
189
*/
190
- tcg_debug_assert(TCG_TARGET_HAS_direct_jump);
191
s->gen_tb->jmp_insn_offset[which] = tcg_current_code_size(s);
192
}
193
194
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
195
index XXXXXXX..XXXXXXX 100644
196
--- a/tcg/arm/tcg-target.c.inc
197
+++ b/tcg/arm/tcg-target.c.inc
198
@@ -XXX,XX +XXX,XX @@ static void tcg_out_goto_tb(TCGContext *s, int which)
199
intptr_t ptr, dif, dil;
200
TCGReg base = TCG_REG_PC;
201
202
- qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
203
ptr = get_jmp_target_addr(s, which);
204
dif = tcg_pcrel_diff(s, (void *)ptr) - 8;
205
dil = sextract32(dif, 0, 12);
206
diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
207
index XXXXXXX..XXXXXXX 100644
208
--- a/tcg/mips/tcg-target.c.inc
209
+++ b/tcg/mips/tcg-target.c.inc
210
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
211
static void tcg_out_goto_tb(TCGContext *s, int which)
212
{
213
/* indirect jump method */
214
- qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
215
tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP0, TCG_REG_ZERO,
216
get_jmp_target_addr(s, which));
217
tcg_out_opc_reg(s, OPC_JR, 0, TCG_TMP0, 0);
218
diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
219
index XXXXXXX..XXXXXXX 100644
220
--- a/tcg/riscv/tcg-target.c.inc
221
+++ b/tcg/riscv/tcg-target.c.inc
222
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
223
224
static void tcg_out_goto_tb(TCGContext *s, int which)
225
{
226
- qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
227
/* indirect jump method */
228
tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, TCG_REG_ZERO,
229
get_jmp_target_addr(s, which));
230
diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
231
index XXXXXXX..XXXXXXX 100644
232
--- a/tcg/s390x/tcg-target.c.inc
233
+++ b/tcg/s390x/tcg-target.c.inc
234
@@ -XXX,XX +XXX,XX @@ static void tcg_out_goto_tb(TCGContext *s, int which)
235
void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
236
uintptr_t jmp_rx, uintptr_t jmp_rw)
237
{
238
+ if (!HAVE_FACILITY(GEN_INST_EXT)) {
239
+ return;
240
+ }
241
/* patch the branch destination */
242
uintptr_t addr = tb->jmp_target_addr[n];
243
intptr_t disp = addr - (jmp_rx - 2);
244
diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
245
index XXXXXXX..XXXXXXX 100644
246
--- a/tcg/tci/tcg-target.c.inc
247
+++ b/tcg/tci/tcg-target.c.inc
248
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg)
249
250
static void tcg_out_goto_tb(TCGContext *s, int which)
251
{
252
- qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
253
/* indirect jump method. */
254
tcg_out_op_p(s, INDEX_op_goto_tb, (void *)get_jmp_target_addr(s, which));
255
set_jmp_reset_offset(s, which);
256
--
257
2.34.1
258
259
New patch
1
The old implementation replaces two insns, swapping between

    b    <dest>
    nop
    br   x30
and
    adrp x30, <dest>
    addi x30, x30, lo12:<dest>
    br   x30

There is a race condition in which a thread could be stopped at
the PC of the second insn, and when restarted does not see the
complete address computation and branches to nowhere.

The new implementation replaces only one insn, swapping between

    b    <dest>
    br   tmp
and
    ldr  tmp, <jmp_addr>
    br   tmp

Reported-by: hev <r@hev.cc>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
26
---
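The key point is that only one naturally aligned insn slot is ever rewritten,
so a concurrent executor always fetches a complete, valid instruction. Below is
a standalone sketch of that property, not QEMU code: the encodings are made up,
and real code must also flush the instruction cache after the store.

    #include <stdatomic.h>
    #include <stdint.h>
    #include <assert.h>

    #define INSN_B_DEST   0x14000010u   /* stand-in for "b <dest>" */
    #define INSN_LDR_TMP  0x58000090u   /* stand-in for "ldr tmp, <jmp_addr>" */

    static _Atomic uint32_t jmp_slot = INSN_B_DEST;

    /* Patcher: retarget the slot with a single atomic 32-bit store. */
    static void set_jmp_target(uint32_t insn)
    {
        atomic_store_explicit(&jmp_slot, insn, memory_order_release);
    }

    /* Executor: whatever it fetches is one complete, valid instruction. */
    static uint32_t fetch_jmp_slot(void)
    {
        uint32_t insn = atomic_load_explicit(&jmp_slot, memory_order_acquire);
        assert(insn == INSN_B_DEST || insn == INSN_LDR_TMP);
        return insn;
    }

    int main(void)
    {
        fetch_jmp_slot();
        set_jmp_target(INSN_LDR_TMP);
        fetch_jmp_slot();
        return 0;
    }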
27
tcg/aarch64/tcg-target.h | 2 +-
28
tcg/aarch64/tcg-target.c.inc | 66 +++++++++++++++---------------------
29
2 files changed, 29 insertions(+), 39 deletions(-)
30
31
diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
32
index XXXXXXX..XXXXXXX 100644
33
--- a/tcg/aarch64/tcg-target.h
34
+++ b/tcg/aarch64/tcg-target.h
35
@@ -XXX,XX +XXX,XX @@
36
37
#define TCG_TARGET_INSN_UNIT_SIZE 4
38
#define TCG_TARGET_TLB_DISPLACEMENT_BITS 24
39
-#define MAX_CODE_GEN_BUFFER_SIZE (2 * GiB)
40
+#define MAX_CODE_GEN_BUFFER_SIZE ((size_t)-1)
41
42
typedef enum {
43
TCG_REG_X0, TCG_REG_X1, TCG_REG_X2, TCG_REG_X3,
44
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
45
index XXXXXXX..XXXXXXX 100644
46
--- a/tcg/aarch64/tcg-target.c.inc
47
+++ b/tcg/aarch64/tcg-target.c.inc
48
@@ -XXX,XX +XXX,XX @@ static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target,
49
tcg_out_call_int(s, target);
50
}
51
52
-void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
53
- uintptr_t jmp_rx, uintptr_t jmp_rw)
54
-{
55
- uintptr_t addr = tb->jmp_target_addr[n];
56
- tcg_insn_unit i1, i2;
57
- TCGType rt = TCG_TYPE_I64;
58
- TCGReg rd = TCG_REG_TMP;
59
- uint64_t pair;
60
-
61
- ptrdiff_t offset = addr - jmp_rx;
62
-
63
- if (offset == sextract64(offset, 0, 26)) {
64
- i1 = I3206_B | ((offset >> 2) & 0x3ffffff);
65
- i2 = NOP;
66
- } else {
67
- offset = (addr >> 12) - (jmp_rx >> 12);
68
-
69
- /* patch ADRP */
70
- i1 = I3406_ADRP | (offset & 3) << 29 | (offset & 0x1ffffc) << (5 - 2) | rd;
71
- /* patch ADDI */
72
- i2 = I3401_ADDI | rt << 31 | (addr & 0xfff) << 10 | rd << 5 | rd;
73
- }
74
- pair = (uint64_t)i2 << 32 | i1;
75
- qatomic_set((uint64_t *)jmp_rw, pair);
76
- flush_idcache_range(jmp_rx, jmp_rw, 8);
77
-}
78
-
79
static inline void tcg_out_goto_label(TCGContext *s, TCGLabel *l)
80
{
81
if (!l->has_value) {
82
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
83
static void tcg_out_goto_tb(TCGContext *s, int which)
84
{
85
/*
86
- * Ensure that ADRP+ADD are 8-byte aligned so that an atomic
87
- * write can be used to patch the target address.
88
+ * Direct branch, or indirect address load, will be patched
89
+ * by tb_target_set_jmp_target. Assert indirect load offset
90
+ * in range early, regardless of direct branch distance.
91
*/
92
- if ((uintptr_t)s->code_ptr & 7) {
93
- tcg_out32(s, NOP);
94
- }
95
+ intptr_t i_off = tcg_pcrel_diff(s, (void *)get_jmp_target_addr(s, which));
96
+ tcg_debug_assert(i_off == sextract64(i_off, 0, 21));
97
+
98
set_jmp_insn_offset(s, which);
99
- /*
100
- * actual branch destination will be patched by
101
- * tb_target_set_jmp_target later
102
- */
103
- tcg_out_insn(s, 3406, ADRP, TCG_REG_TMP, 0);
104
- tcg_out_insn(s, 3401, ADDI, TCG_TYPE_I64, TCG_REG_TMP, TCG_REG_TMP, 0);
105
+ tcg_out32(s, I3206_B);
106
tcg_out_insn(s, 3207, BR, TCG_REG_TMP);
107
set_jmp_reset_offset(s, which);
108
}
109
110
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
111
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
112
+{
113
+ uintptr_t d_addr = tb->jmp_target_addr[n];
114
+ ptrdiff_t d_offset = d_addr - jmp_rx;
115
+ tcg_insn_unit insn;
116
+
117
+ /* Either directly branch, or indirect branch load. */
118
+ if (d_offset == sextract64(d_offset, 0, 28)) {
119
+ insn = deposit32(I3206_B, 0, 26, d_offset >> 2);
120
+ } else {
121
+ uintptr_t i_addr = (uintptr_t)&tb->jmp_target_addr[n];
122
+ ptrdiff_t i_offset = i_addr - jmp_rx;
123
+
124
+ /* Note that we asserted this in range in tcg_out_goto_tb. */
125
+ insn = deposit32(I3305_LDR | TCG_REG_TMP, 0, 5, i_offset >> 2);
126
+ }
127
+ qatomic_set((uint32_t *)jmp_rw, insn);
128
+ flush_idcache_range(jmp_rx, jmp_rw, 4);
129
+}
130
+
131
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
132
const TCGArg args[TCG_MAX_OP_ARGS],
133
const int const_args[TCG_MAX_OP_ARGS])
134
--
135
2.34.1
136
137
New patch
1
1
The old ppc64 implementation replaces 2 or 4 insns, which leaves a race
condition in which a thread could be stopped at a PC in the middle of
the sequence, and when restarted does not see the complete address
computation and branches to nowhere.

The new implementation replaces only one insn, swapping between

    b <dest>
and
    mtctr    r31

falling through to a general-case indirect branch.
13
14
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
15
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
16
---
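In other words, the retargetable region keeps its indirect tail in place and
only the middle slot is ever rewritten. A small standalone sketch of that
layout follows; it is not QEMU code and the mnemonics are illustrative only.

    #include <stdio.h>

    /* The three insn slots that make up one goto_tb region. */
    static const char *region[3] = {
        "ld    r31, <jmp_addr>",   /* always reloads the current destination */
        "b     <dest>",            /* the only slot that is ever rewritten */
        "bctr",                    /* indirect tail, reached when out of range */
    };

    /* Retargeting rewrites just the middle slot; real code does so atomically. */
    static void retarget(int direct_in_range)
    {
        region[1] = direct_in_range ? "b     <dest>" : "mtctr r31";
    }

    int main(void)
    {
        retarget(0);
        for (int i = 0; i < 3; i++) {
            printf("%s\n", region[i]);
        }
        return 0;
    }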
17
tcg/ppc/tcg-target.h | 3 +-
18
tcg/ppc/tcg-target.c.inc | 158 +++++++++++----------------------------
19
2 files changed, 44 insertions(+), 117 deletions(-)
20
21
diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
22
index XXXXXXX..XXXXXXX 100644
23
--- a/tcg/ppc/tcg-target.h
24
+++ b/tcg/ppc/tcg-target.h
25
@@ -XXX,XX +XXX,XX @@
26
27
#ifdef _ARCH_PPC64
28
# define TCG_TARGET_REG_BITS 64
29
-# define MAX_CODE_GEN_BUFFER_SIZE (2 * GiB)
30
#else
31
# define TCG_TARGET_REG_BITS 32
32
-# define MAX_CODE_GEN_BUFFER_SIZE (32 * MiB)
33
#endif
34
+#define MAX_CODE_GEN_BUFFER_SIZE ((size_t)-1)
35
36
#define TCG_TARGET_NB_REGS 64
37
#define TCG_TARGET_INSN_UNIT_SIZE 4
38
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
39
index XXXXXXX..XXXXXXX 100644
40
--- a/tcg/ppc/tcg-target.c.inc
41
+++ b/tcg/ppc/tcg-target.c.inc
42
@@ -XXX,XX +XXX,XX @@ static void tcg_out_mb(TCGContext *s, TCGArg a0)
43
tcg_out32(s, insn);
44
}
45
46
-static inline uint64_t make_pair(tcg_insn_unit i1, tcg_insn_unit i2)
47
-{
48
- if (HOST_BIG_ENDIAN) {
49
- return (uint64_t)i1 << 32 | i2;
50
- }
51
- return (uint64_t)i2 << 32 | i1;
52
-}
53
-
54
-static inline void ppc64_replace2(uintptr_t rx, uintptr_t rw,
55
- tcg_insn_unit i0, tcg_insn_unit i1)
56
-{
57
-#if TCG_TARGET_REG_BITS == 64
58
- qatomic_set((uint64_t *)rw, make_pair(i0, i1));
59
- flush_idcache_range(rx, rw, 8);
60
-#else
61
- qemu_build_not_reached();
62
-#endif
63
-}
64
-
65
-static inline void ppc64_replace4(uintptr_t rx, uintptr_t rw,
66
- tcg_insn_unit i0, tcg_insn_unit i1,
67
- tcg_insn_unit i2, tcg_insn_unit i3)
68
-{
69
- uint64_t p[2];
70
-
71
- p[!HOST_BIG_ENDIAN] = make_pair(i0, i1);
72
- p[HOST_BIG_ENDIAN] = make_pair(i2, i3);
73
-
74
- /*
75
- * There's no convenient way to get the compiler to allocate a pair
76
- * of registers at an even index, so copy into r6/r7 and clobber.
77
- */
78
- asm("mr %%r6, %1\n\t"
79
- "mr %%r7, %2\n\t"
80
- "stq %%r6, %0"
81
- : "=Q"(*(__int128 *)rw) : "r"(p[0]), "r"(p[1]) : "r6", "r7");
82
- flush_idcache_range(rx, rw, 16);
83
-}
84
-
85
-void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
86
- uintptr_t jmp_rx, uintptr_t jmp_rw)
87
-{
88
- tcg_insn_unit i0, i1, i2, i3;
89
- uintptr_t addr = tb->jmp_target_addr[n];
90
- intptr_t tb_diff = addr - (uintptr_t)tb->tc.ptr;
91
- intptr_t br_diff = addr - (jmp_rx + 4);
92
- intptr_t lo, hi;
93
-
94
- if (TCG_TARGET_REG_BITS == 32) {
95
- intptr_t diff = addr - jmp_rx;
96
- tcg_debug_assert(in_range_b(diff));
97
- qatomic_set((uint32_t *)jmp_rw, B | (diff & 0x3fffffc));
98
- flush_idcache_range(jmp_rx, jmp_rw, 4);
99
- return;
100
- }
101
-
102
- /*
103
- * For 16-bit displacements, we can use a single add + branch.
104
- * This happens quite often.
105
- */
106
- if (tb_diff == (int16_t)tb_diff) {
107
- i0 = ADDI | TAI(TCG_REG_TB, TCG_REG_TB, tb_diff);
108
- i1 = B | (br_diff & 0x3fffffc);
109
- ppc64_replace2(jmp_rx, jmp_rw, i0, i1);
110
- return;
111
- }
112
-
113
- lo = (int16_t)tb_diff;
114
- hi = (int32_t)(tb_diff - lo);
115
- assert(tb_diff == hi + lo);
116
- i0 = ADDIS | TAI(TCG_REG_TB, TCG_REG_TB, hi >> 16);
117
- i1 = ADDI | TAI(TCG_REG_TB, TCG_REG_TB, lo);
118
-
119
- /*
120
- * Without stq from 2.07, we can only update two insns,
121
- * and those must be the ones that load the target address.
122
- */
123
- if (!have_isa_2_07) {
124
- ppc64_replace2(jmp_rx, jmp_rw, i0, i1);
125
- return;
126
- }
127
-
128
- /*
129
- * For 26-bit displacements, we can use a direct branch.
130
- * Otherwise we still need the indirect branch, which we
131
- * must restore after a potential direct branch write.
132
- */
133
- br_diff -= 4;
134
- if (in_range_b(br_diff)) {
135
- i2 = B | (br_diff & 0x3fffffc);
136
- i3 = NOP;
137
- } else {
138
- i2 = MTSPR | RS(TCG_REG_TB) | CTR;
139
- i3 = BCCTR | BO_ALWAYS;
140
- }
141
- ppc64_replace4(jmp_rx, jmp_rw, i0, i1, i2, i3);
142
-}
143
-
144
static void tcg_out_call_int(TCGContext *s, int lk,
145
const tcg_insn_unit *target)
146
{
147
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg)
148
149
static void tcg_out_goto_tb(TCGContext *s, int which)
150
{
151
- /* Direct jump. */
152
- if (TCG_TARGET_REG_BITS == 64) {
153
- /* Ensure the next insns are 8 or 16-byte aligned. */
154
- while ((uintptr_t)s->code_ptr & (have_isa_2_07 ? 15 : 7)) {
155
- tcg_out32(s, NOP);
156
- }
157
+ uintptr_t ptr = get_jmp_target_addr(s, which);
158
+
159
+ if (USE_REG_TB) {
160
+ ptrdiff_t offset = tcg_tbrel_diff(s, (void *)ptr);
161
+ tcg_out_mem_long(s, LD, LDX, TCG_REG_TB, TCG_REG_TB, offset);
162
+
163
+ /* Direct branch will be patched by tb_target_set_jmp_target. */
164
set_jmp_insn_offset(s, which);
165
- tcg_out32(s, ADDIS | TAI(TCG_REG_TB, TCG_REG_TB, 0));
166
- tcg_out32(s, ADDI | TAI(TCG_REG_TB, TCG_REG_TB, 0));
167
tcg_out32(s, MTSPR | RS(TCG_REG_TB) | CTR);
168
+
169
+ /* When branch is out of range, fall through to indirect. */
170
+ tcg_out32(s, BCCTR | BO_ALWAYS);
171
+
172
+ /* For the unlinked case, need to reset TCG_REG_TB. */
173
+ set_jmp_reset_offset(s, which);
174
+ tcg_out_mem_long(s, ADDI, ADD, TCG_REG_TB, TCG_REG_TB,
175
+ -tcg_current_code_size(s));
176
+ } else {
177
+ /* Direct branch will be patched by tb_target_set_jmp_target. */
178
+ set_jmp_insn_offset(s, which);
179
+ tcg_out32(s, NOP);
180
+
181
+ /* When branch is out of range, fall through to indirect. */
182
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP1, ptr - (int16_t)ptr);
183
+ tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1, (int16_t)ptr);
184
+ tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
185
tcg_out32(s, BCCTR | BO_ALWAYS);
186
set_jmp_reset_offset(s, which);
187
- if (USE_REG_TB) {
188
- /* For the unlinked case, need to reset TCG_REG_TB. */
189
- tcg_out_mem_long(s, ADDI, ADD, TCG_REG_TB, TCG_REG_TB,
190
- -tcg_current_code_size(s));
191
- }
192
- } else {
193
- set_jmp_insn_offset(s, which);
194
- tcg_out32(s, B);
195
- set_jmp_reset_offset(s, which);
196
}
197
}
198
199
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
200
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
201
+{
202
+ uintptr_t addr = tb->jmp_target_addr[n];
203
+ intptr_t diff = addr - jmp_rx;
204
+ tcg_insn_unit insn;
205
+
206
+ if (in_range_b(diff)) {
207
+ insn = B | (diff & 0x3fffffc);
208
+ } else if (USE_REG_TB) {
209
+ insn = MTSPR | RS(TCG_REG_TB) | CTR;
210
+ } else {
211
+ insn = NOP;
212
+ }
213
+
214
+ qatomic_set((uint32_t *)jmp_rw, insn);
215
+ flush_idcache_range(jmp_rx, jmp_rw, 4);
216
+}
217
+
218
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
219
const TCGArg args[TCG_MAX_OP_ARGS],
220
const int const_args[TCG_MAX_OP_ARGS])
221
--
222
2.34.1
223
224
New patch
1
This is always true for sparc64, so this is dead since 3a5f6805c7ca.
1
2
3
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
4
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
7
tcg/sparc64/tcg-target.c.inc | 62 ++++++++++++------------------------
8
1 file changed, 21 insertions(+), 41 deletions(-)
9
10
diff --git a/tcg/sparc64/tcg-target.c.inc b/tcg/sparc64/tcg-target.c.inc
11
index XXXXXXX..XXXXXXX 100644
12
--- a/tcg/sparc64/tcg-target.c.inc
13
+++ b/tcg/sparc64/tcg-target.c.inc
14
@@ -XXX,XX +XXX,XX @@ static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
15
#endif
16
17
#define TCG_REG_TB TCG_REG_I1
18
-#define USE_REG_TB (sizeof(void *) > 4)
19
20
static const int tcg_target_reg_alloc_order[] = {
21
TCG_REG_L0,
22
@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
23
}
24
25
/* A 13-bit constant relative to the TB. */
26
- if (!in_prologue && USE_REG_TB) {
27
+ if (!in_prologue) {
28
test = tcg_tbrel_diff(s, (void *)arg);
29
if (check_fit_ptr(test, 13)) {
30
tcg_out_arithi(s, ret, TCG_REG_TB, test, ARITH_ADD);
31
@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
32
}
33
34
/* Use the constant pool, if possible. */
35
- if (!in_prologue && USE_REG_TB) {
36
+ if (!in_prologue) {
37
new_pool_label(s, arg, R_SPARC_13, s->code_ptr,
38
tcg_tbrel_diff(s, NULL));
39
tcg_out32(s, LDX | INSN_RD(ret) | INSN_RS1(TCG_REG_TB));
40
@@ -XXX,XX +XXX,XX @@ static void tcg_target_qemu_prologue(TCGContext *s)
41
#endif
42
43
/* We choose TCG_REG_TB such that no move is required. */
44
- if (USE_REG_TB) {
45
- QEMU_BUILD_BUG_ON(TCG_REG_TB != TCG_REG_I1);
46
- tcg_regset_set_reg(s->reserved_regs, TCG_REG_TB);
47
- }
48
+ QEMU_BUILD_BUG_ON(TCG_REG_TB != TCG_REG_I1);
49
+ tcg_regset_set_reg(s->reserved_regs, TCG_REG_TB);
50
51
tcg_out_arithi(s, TCG_REG_G0, TCG_REG_I1, 0, JMPL);
52
/* delay slot */
53
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
54
tcg_out_arithi(s, TCG_REG_G0, TCG_REG_I7, 8, RETURN);
55
tcg_out_movi_imm13(s, TCG_REG_O0, a0);
56
return;
57
- } else if (USE_REG_TB) {
58
+ } else {
59
intptr_t tb_diff = tcg_tbrel_diff(s, (void *)a0);
60
if (check_fit_ptr(tb_diff, 13)) {
61
tcg_out_arithi(s, TCG_REG_G0, TCG_REG_I7, 8, RETURN);
62
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
63
64
static void tcg_out_goto_tb(TCGContext *s, int which)
65
{
66
+ int c;
67
+
68
/* Direct jump. */
69
- if (USE_REG_TB) {
70
- /* make sure the patch is 8-byte aligned. */
71
- if ((intptr_t)s->code_ptr & 4) {
72
- tcg_out_nop(s);
73
- }
74
- set_jmp_insn_offset(s, which);
75
- tcg_out_sethi(s, TCG_REG_T1, 0);
76
- tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, 0, ARITH_OR);
77
- tcg_out_arith(s, TCG_REG_G0, TCG_REG_TB, TCG_REG_T1, JMPL);
78
- tcg_out_arith(s, TCG_REG_TB, TCG_REG_TB, TCG_REG_T1, ARITH_ADD);
79
- } else {
80
- set_jmp_insn_offset(s, which);
81
- tcg_out32(s, CALL);
82
+ /* make sure the patch is 8-byte aligned. */
83
+ if ((intptr_t)s->code_ptr & 4) {
84
tcg_out_nop(s);
85
}
86
+ set_jmp_insn_offset(s, which);
87
+ tcg_out_sethi(s, TCG_REG_T1, 0);
88
+ tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, 0, ARITH_OR);
89
+ tcg_out_arith(s, TCG_REG_G0, TCG_REG_TB, TCG_REG_T1, JMPL);
90
+ tcg_out_arith(s, TCG_REG_TB, TCG_REG_TB, TCG_REG_T1, ARITH_ADD);
91
set_jmp_reset_offset(s, which);
92
93
/*
94
* For the unlinked path of goto_tb, we need to reset TCG_REG_TB
95
* to the beginning of this TB.
96
*/
97
- if (USE_REG_TB) {
98
- int c = -tcg_current_code_size(s);
99
- if (check_fit_i32(c, 13)) {
100
- tcg_out_arithi(s, TCG_REG_TB, TCG_REG_TB, c, ARITH_ADD);
101
- } else {
102
- tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_T1, c);
103
- tcg_out_arith(s, TCG_REG_TB, TCG_REG_TB, TCG_REG_T1, ARITH_ADD);
104
- }
105
+ c = -tcg_current_code_size(s);
106
+ if (check_fit_i32(c, 13)) {
107
+ tcg_out_arithi(s, TCG_REG_TB, TCG_REG_TB, c, ARITH_ADD);
108
+ } else {
109
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_T1, c);
110
+ tcg_out_arith(s, TCG_REG_TB, TCG_REG_TB, TCG_REG_T1, ARITH_ADD);
111
}
112
}
113
114
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
115
switch (opc) {
116
case INDEX_op_goto_ptr:
117
tcg_out_arithi(s, TCG_REG_G0, a0, 0, JMPL);
118
- if (USE_REG_TB) {
119
- tcg_out_mov_delay(s, TCG_REG_TB, a0);
120
- } else {
121
- tcg_out_nop(s);
122
- }
123
+ tcg_out_mov_delay(s, TCG_REG_TB, a0);
124
break;
125
case INDEX_op_br:
126
tcg_out_bpcc(s, COND_A, BPCC_PT, arg_label(a0));
127
@@ -XXX,XX +XXX,XX @@ void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
128
tcg_debug_assert(tb_disp == (int32_t)tb_disp);
129
tcg_debug_assert(br_disp == (int32_t)br_disp);
130
131
- if (!USE_REG_TB) {
132
- qatomic_set((uint32_t *)jmp_rw,
133
-         deposit32(CALL, 0, 30, br_disp >> 2));
134
- flush_idcache_range(jmp_rx, jmp_rw, 4);
135
- return;
136
- }
137
-
138
/* This does not exercise the range of the branch, but we do
139
still need to be able to load the new value of TCG_REG_TB.
140
But this does still happen quite often. */
141
--
142
2.34.1
143
144
diff view generated by jsdifflib
New patch
1
The old sparc64 implementation may replace two insns, which leaves
2
a race condition in which a thread could be stopped at a PC in the
3
middle of the sequence, and when restarted does not see the complete
4
address computation and branches to nowhere.
1
5
6
The new implementation replaces only one insn, swapping between a
7
direct branch and a direct call. The TCG_REG_TB register is loaded
8
from tb->jmp_target_addr[] in the delay slot.
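
(Sketch only, with a hypothetical helper name: the patched insn is chosen by
how many bits the word displacement needs, mirroring the hunk below.)

    /* Illustration: pick the smallest branch form that reaches addr. */
    static tcg_insn_unit pick_goto_tb_insn(uintptr_t addr, uintptr_t jmp_rx)
    {
        intptr_t br_disp = (intptr_t)(addr - jmp_rx) >> 2;  /* word offset */

        if (check_fit_ptr(br_disp, 19)) {        /* ba,pt %icc, addr */
            return deposit32(INSN_OP(0) | INSN_OP2(1) | INSN_COND(COND_A)
                             | BPCC_ICC | BPCC_PT, 0, 19, br_disp);
        } else if (check_fit_ptr(br_disp, 22)) { /* ba addr */
            return deposit32(INSN_OP(0) | INSN_OP2(2) | INSN_COND(COND_A),
                             0, 22, br_disp);
        }
        /* The code_gen_buffer can't be larger than 2GB. */
        tcg_debug_assert(check_fit_ptr(br_disp, 30));
        return deposit32(CALL, 0, 30, br_disp);  /* call addr */
    }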
9
10
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
11
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
12
---
13
tcg/sparc64/tcg-target.c.inc | 87 +++++++++++++++---------------------
14
1 file changed, 37 insertions(+), 50 deletions(-)
15
16
diff --git a/tcg/sparc64/tcg-target.c.inc b/tcg/sparc64/tcg-target.c.inc
17
index XXXXXXX..XXXXXXX 100644
18
--- a/tcg/sparc64/tcg-target.c.inc
19
+++ b/tcg/sparc64/tcg-target.c.inc
20
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
21
22
static void tcg_out_goto_tb(TCGContext *s, int which)
23
{
24
- int c;
25
+ ptrdiff_t off = tcg_tbrel_diff(s, (void *)get_jmp_target_addr(s, which));
26
27
- /* Direct jump. */
28
- /* make sure the patch is 8-byte aligned. */
29
- if ((intptr_t)s->code_ptr & 4) {
30
- tcg_out_nop(s);
31
- }
32
+ /* Direct branch will be patched by tb_target_set_jmp_target. */
33
set_jmp_insn_offset(s, which);
34
- tcg_out_sethi(s, TCG_REG_T1, 0);
35
- tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, 0, ARITH_OR);
36
- tcg_out_arith(s, TCG_REG_G0, TCG_REG_TB, TCG_REG_T1, JMPL);
37
- tcg_out_arith(s, TCG_REG_TB, TCG_REG_TB, TCG_REG_T1, ARITH_ADD);
38
+ tcg_out32(s, CALL);
39
+ /* delay slot */
40
+ tcg_debug_assert(check_fit_ptr(off, 13));
41
+ tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TB, TCG_REG_TB, off);
42
set_jmp_reset_offset(s, which);
43
44
/*
45
* For the unlinked path of goto_tb, we need to reset TCG_REG_TB
46
* to the beginning of this TB.
47
*/
48
- c = -tcg_current_code_size(s);
49
- if (check_fit_i32(c, 13)) {
50
- tcg_out_arithi(s, TCG_REG_TB, TCG_REG_TB, c, ARITH_ADD);
51
+ off = -tcg_current_code_size(s);
52
+ if (check_fit_i32(off, 13)) {
53
+ tcg_out_arithi(s, TCG_REG_TB, TCG_REG_TB, off, ARITH_ADD);
54
} else {
55
- tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_T1, c);
56
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_T1, off);
57
tcg_out_arith(s, TCG_REG_TB, TCG_REG_TB, TCG_REG_T1, ARITH_ADD);
58
}
59
}
60
61
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
62
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
63
+{
64
+ uintptr_t addr = tb->jmp_target_addr[n];
65
+ intptr_t br_disp = (intptr_t)(addr - jmp_rx);
66
+ tcg_insn_unit insn;
67
+
68
+ br_disp >>= 2;
69
+ if (check_fit_ptr(br_disp, 19)) {
70
+ /* ba,pt %icc, addr */
71
+ insn = deposit32(INSN_OP(0) | INSN_OP2(1) | INSN_COND(COND_A)
72
+ | BPCC_ICC | BPCC_PT, 0, 19, br_disp);
73
+ } else if (check_fit_ptr(br_disp, 22)) {
74
+ /* ba addr */
75
+ insn = deposit32(INSN_OP(0) | INSN_OP2(2) | INSN_COND(COND_A),
76
+ 0, 22, br_disp);
77
+ } else {
78
+ /* The code_gen_buffer can't be larger than 2GB. */
79
+ tcg_debug_assert(check_fit_ptr(br_disp, 30));
80
+ /* call addr */
81
+ insn = deposit32(CALL, 0, 30, br_disp);
82
+ }
83
+
84
+ qatomic_set((uint32_t *)jmp_rw, insn);
85
+ flush_idcache_range(jmp_rx, jmp_rw, 4);
86
+}
87
+
88
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
89
const TCGArg args[TCG_MAX_OP_ARGS],
90
const int const_args[TCG_MAX_OP_ARGS])
91
@@ -XXX,XX +XXX,XX @@ void tcg_register_jit(const void *buf, size_t buf_size)
92
{
93
tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
94
}
95
-
96
-void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
97
- uintptr_t jmp_rx, uintptr_t jmp_rw)
98
-{
99
- uintptr_t addr = tb->jmp_target_addr[n];
100
- intptr_t tb_disp = addr - (uintptr_t)tb->tc.ptr;
101
- intptr_t br_disp = addr - jmp_rx;
102
- tcg_insn_unit i1, i2;
103
-
104
- /* We can reach the entire address space for ILP32.
105
- For LP64, the code_gen_buffer can't be larger than 2GB. */
106
- tcg_debug_assert(tb_disp == (int32_t)tb_disp);
107
- tcg_debug_assert(br_disp == (int32_t)br_disp);
108
-
109
- /* This does not exercise the range of the branch, but we do
110
- still need to be able to load the new value of TCG_REG_TB.
111
- But this does still happen quite often. */
112
- if (check_fit_ptr(tb_disp, 13)) {
113
- /* ba,pt %icc, addr */
114
- i1 = (INSN_OP(0) | INSN_OP2(1) | INSN_COND(COND_A)
115
- | BPCC_ICC | BPCC_PT | INSN_OFF19(br_disp));
116
- i2 = (ARITH_ADD | INSN_RD(TCG_REG_TB) | INSN_RS1(TCG_REG_TB)
117
- | INSN_IMM13(tb_disp));
118
- } else if (tb_disp >= 0) {
119
- i1 = SETHI | INSN_RD(TCG_REG_T1) | ((tb_disp & 0xfffffc00) >> 10);
120
- i2 = (ARITH_OR | INSN_RD(TCG_REG_T1) | INSN_RS1(TCG_REG_T1)
121
- | INSN_IMM13(tb_disp & 0x3ff));
122
- } else {
123
- i1 = SETHI | INSN_RD(TCG_REG_T1) | ((~tb_disp & 0xfffffc00) >> 10);
124
- i2 = (ARITH_XOR | INSN_RD(TCG_REG_T1) | INSN_RS1(TCG_REG_T1)
125
- | INSN_IMM13((tb_disp & 0x3ff) | -0x400));
126
- }
127
-
128
- qatomic_set((uint64_t *)jmp_rw, deposit64(i2, 32, 32, i1));
129
- flush_idcache_range(jmp_rx, jmp_rw, 8);
130
-}
131
--
132
2.34.1
133
134
1
From: Claudio Fontana <cfontana@suse.de>
1
Now that tcg can handle direct and indirect goto_tb
2
simultaneously, we can optimistically leave space for
3
a direct branch and fall back to loading the pointer
4
from the TB for an indirect branch.
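
(A minimal sketch of the emit-side shape, using the helpers from this series;
it omits the far-displacement case that the real hunk below still handles.)

    /* Reserve one patchable insn, then fall through to the indirect path. */
    set_jmp_insn_offset(s, which);   /* record where the nop sits */
    tcg_out32(s, INSN_NOP);          /* patched to "b <dest>" when linked */

    /* Unlinked or out of range: load the target from tb->jmp_target_addr[]. */
    intptr_t i_disp = tcg_pcrel_diff(s, (void *)get_jmp_target_addr(s, which)) - 8;
    tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_PC, i_disp);
    set_jmp_reset_offset(s, which);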
2
5
3
after the initial split into 3 tcg variants, we proceed to also
6
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
4
split tcg_start_vcpu_thread.
5
6
We actually split it in 2 this time, since the icount variant
7
just uses the round robin function.
8
9
Suggested-by: Richard Henderson <richard.henderson@linaro.org>
10
Signed-off-by: Claudio Fontana <cfontana@suse.de>
11
Message-Id: <20201015143217.29337-3-cfontana@suse.de>
12
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
13
---
8
---
14
accel/tcg/tcg-cpus-mttcg.h | 21 --------------
9
tcg/arm/tcg-target.c.inc | 52 ++++++++++++++++++++++++++++------------
15
accel/tcg/tcg-cpus-rr.h | 3 +-
10
1 file changed, 37 insertions(+), 15 deletions(-)
16
accel/tcg/tcg-cpus.h | 1 -
17
accel/tcg/tcg-all.c | 5 ++++
18
accel/tcg/tcg-cpus-icount.c | 2 +-
19
accel/tcg/tcg-cpus-mttcg.c | 29 +++++++++++++++++--
20
accel/tcg/tcg-cpus-rr.c | 39 +++++++++++++++++++++++--
21
accel/tcg/tcg-cpus.c | 58 -------------------------------------
22
8 files changed, 71 insertions(+), 87 deletions(-)
23
delete mode 100644 accel/tcg/tcg-cpus-mttcg.h
24
11
25
diff --git a/accel/tcg/tcg-cpus-mttcg.h b/accel/tcg/tcg-cpus-mttcg.h
12
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
26
deleted file mode 100644
27
index XXXXXXX..XXXXXXX
28
--- a/accel/tcg/tcg-cpus-mttcg.h
29
+++ /dev/null
30
@@ -XXX,XX +XXX,XX @@
31
-/*
32
- * QEMU TCG Multi Threaded vCPUs implementation
33
- *
34
- * Copyright 2020 SUSE LLC
35
- *
36
- * This work is licensed under the terms of the GNU GPL, version 2 or later.
37
- * See the COPYING file in the top-level directory.
38
- */
39
-
40
-#ifndef TCG_CPUS_MTTCG_H
41
-#define TCG_CPUS_MTTCG_H
42
-
43
-/*
44
- * In the multi-threaded case each vCPU has its own thread. The TLS
45
- * variable current_cpu can be used deep in the code to find the
46
- * current CPUState for a given thread.
47
- */
48
-
49
-void *tcg_cpu_thread_fn(void *arg);
50
-
51
-#endif /* TCG_CPUS_MTTCG_H */
52
diff --git a/accel/tcg/tcg-cpus-rr.h b/accel/tcg/tcg-cpus-rr.h
53
index XXXXXXX..XXXXXXX 100644
13
index XXXXXXX..XXXXXXX 100644
54
--- a/accel/tcg/tcg-cpus-rr.h
14
--- a/tcg/arm/tcg-target.c.inc
55
+++ b/accel/tcg/tcg-cpus-rr.h
15
+++ b/tcg/arm/tcg-target.c.inc
56
@@ -XXX,XX +XXX,XX @@
16
@@ -XXX,XX +XXX,XX @@ typedef enum {
57
/* Kick all RR vCPUs. */
17
ARITH_BIC = 0xe << 21,
58
void qemu_cpu_kick_rr_cpus(CPUState *unused);
18
ARITH_MVN = 0xf << 21,
59
19
60
-void *tcg_rr_cpu_thread_fn(void *arg);
20
+ INSN_B = 0x0a000000,
61
+/* start the round robin vcpu thread */
62
+void rr_start_vcpu_thread(CPUState *cpu);
63
64
#endif /* TCG_CPUS_RR_H */
65
diff --git a/accel/tcg/tcg-cpus.h b/accel/tcg/tcg-cpus.h
66
index XXXXXXX..XXXXXXX 100644
67
--- a/accel/tcg/tcg-cpus.h
68
+++ b/accel/tcg/tcg-cpus.h
69
@@ -XXX,XX +XXX,XX @@ extern const CpusAccel tcg_cpus_mttcg;
70
extern const CpusAccel tcg_cpus_icount;
71
extern const CpusAccel tcg_cpus_rr;
72
73
-void tcg_start_vcpu_thread(CPUState *cpu);
74
void qemu_tcg_destroy_vcpu(CPUState *cpu);
75
int tcg_cpu_exec(CPUState *cpu);
76
void tcg_handle_interrupt(CPUState *cpu, int mask);
77
diff --git a/accel/tcg/tcg-all.c b/accel/tcg/tcg-all.c
78
index XXXXXXX..XXXXXXX 100644
79
--- a/accel/tcg/tcg-all.c
80
+++ b/accel/tcg/tcg-all.c
81
@@ -XXX,XX +XXX,XX @@ static int tcg_init(MachineState *ms)
82
tcg_exec_init(s->tb_size * 1024 * 1024);
83
mttcg_enabled = s->mttcg_enabled;
84
85
+ /*
86
+ * Initialize TCG regions
87
+ */
88
+ tcg_region_init();
89
+
21
+
90
if (mttcg_enabled) {
22
INSN_CLZ = 0x016f0f10,
91
cpus_register_accel(&tcg_cpus_mttcg);
23
INSN_RBIT = 0x06ff0f30,
92
} else if (icount_enabled()) {
24
93
diff --git a/accel/tcg/tcg-cpus-icount.c b/accel/tcg/tcg-cpus-icount.c
25
@@ -XXX,XX +XXX,XX @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
94
index XXXXXXX..XXXXXXX 100644
26
95
--- a/accel/tcg/tcg-cpus-icount.c
27
static void tcg_out_b_imm(TCGContext *s, ARMCond cond, int32_t offset)
96
+++ b/accel/tcg/tcg-cpus-icount.c
28
{
97
@@ -XXX,XX +XXX,XX @@ static void icount_handle_interrupt(CPUState *cpu, int mask)
29
- tcg_out32(s, (cond << 28) | 0x0a000000 |
30
+ tcg_out32(s, (cond << 28) | INSN_B |
31
(((offset - 8) >> 2) & 0x00ffffff));
98
}
32
}
99
33
100
const CpusAccel tcg_cpus_icount = {
34
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg)
101
- .create_vcpu_thread = tcg_start_vcpu_thread,
35
102
+ .create_vcpu_thread = rr_start_vcpu_thread,
36
static void tcg_out_goto_tb(TCGContext *s, int which)
103
.kick_vcpu_thread = qemu_cpu_kick_rr_cpus,
104
105
.handle_interrupt = icount_handle_interrupt,
106
diff --git a/accel/tcg/tcg-cpus-mttcg.c b/accel/tcg/tcg-cpus-mttcg.c
107
index XXXXXXX..XXXXXXX 100644
108
--- a/accel/tcg/tcg-cpus-mttcg.c
109
+++ b/accel/tcg/tcg-cpus-mttcg.c
110
@@ -XXX,XX +XXX,XX @@
111
#include "hw/boards.h"
112
113
#include "tcg-cpus.h"
114
-#include "tcg-cpus-mttcg.h"
115
116
/*
117
* In the multi-threaded case each vCPU has its own thread. The TLS
118
@@ -XXX,XX +XXX,XX @@
119
* current CPUState for a given thread.
120
*/
121
122
-void *tcg_cpu_thread_fn(void *arg)
123
+static void *tcg_cpu_thread_fn(void *arg)
124
{
37
{
125
CPUState *cpu = arg;
38
- /* Indirect jump method */
126
39
- intptr_t ptr, dif, dil;
127
@@ -XXX,XX +XXX,XX @@ static void mttcg_kick_vcpu_thread(CPUState *cpu)
40
- TCGReg base = TCG_REG_PC;
128
cpu_exit(cpu);
41
+ uintptr_t i_addr;
42
+ intptr_t i_disp;
43
44
- ptr = get_jmp_target_addr(s, which);
45
- dif = tcg_pcrel_diff(s, (void *)ptr) - 8;
46
- dil = sextract32(dif, 0, 12);
47
- if (dif != dil) {
48
+ /* Direct branch will be patched by tb_target_set_jmp_target. */
49
+ set_jmp_insn_offset(s, which);
50
+ tcg_out32(s, INSN_NOP);
51
+
52
+ /* When branch is out of range, fall through to indirect. */
53
+ i_addr = get_jmp_target_addr(s, which);
54
+ i_disp = tcg_pcrel_diff(s, (void *)i_addr) - 8;
55
+ tcg_debug_assert(i_disp < 0);
56
+ if (i_disp >= -0xfff) {
57
+ tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_PC, i_disp);
58
+ } else {
59
/*
60
* The TB is close, but outside the 12 bits addressable by
61
* the load. We can extend this to 20 bits with a sub of a
62
- * shifted immediate from pc. In the vastly unlikely event
63
- * the code requires more than 1MB, we'll use 2 insns and
64
- * be no worse off.
65
+ * shifted immediate from pc.
66
*/
67
- base = TCG_REG_R0;
68
- tcg_out_movi32(s, COND_AL, base, ptr - dil);
69
+ int h = -i_disp;
70
+ int l = h & 0xfff;
71
+
72
+ h = encode_imm_nofail(h - l);
73
+ tcg_out_dat_imm(s, COND_AL, ARITH_SUB, TCG_REG_R0, TCG_REG_PC, h);
74
+ tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_R0, l);
75
}
76
- tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, base, dil);
77
set_jmp_reset_offset(s, which);
129
}
78
}
130
79
131
+static void mttcg_start_vcpu_thread(CPUState *cpu)
80
void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
132
+{
81
uintptr_t jmp_rx, uintptr_t jmp_rw)
133
+ char thread_name[VCPU_THREAD_NAME_SIZE];
82
{
83
- /* Always indirect, nothing to do */
84
+ uintptr_t addr = tb->jmp_target_addr[n];
85
+ ptrdiff_t offset = addr - (jmp_rx + 8);
86
+ tcg_insn_unit insn;
134
+
87
+
135
+ g_assert(tcg_enabled());
88
+ /* Either directly branch, or fall through to indirect branch. */
89
+ if (offset == sextract64(offset, 0, 26)) {
90
+ /* B <addr> */
91
+ insn = deposit32((COND_AL << 28) | INSN_B, 0, 24, offset >> 2);
92
+ } else {
93
+ insn = INSN_NOP;
94
+ }
136
+
95
+
137
+ parallel_cpus = (current_machine->smp.max_cpus > 1);
96
+ qatomic_set((uint32_t *)jmp_rw, insn);
138
+
97
+ flush_idcache_range(jmp_rx, jmp_rw, 4);
139
+ cpu->thread = g_malloc0(sizeof(QemuThread));
140
+ cpu->halt_cond = g_malloc0(sizeof(QemuCond));
141
+ qemu_cond_init(cpu->halt_cond);
142
+
143
+ /* create a thread per vCPU with TCG (MTTCG) */
144
+ snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
145
+ cpu->cpu_index);
146
+
147
+ qemu_thread_create(cpu->thread, thread_name, tcg_cpu_thread_fn,
148
+ cpu, QEMU_THREAD_JOINABLE);
149
+
150
+#ifdef _WIN32
151
+ cpu->hThread = qemu_thread_get_handle(cpu->thread);
152
+#endif
153
+}
154
+
155
const CpusAccel tcg_cpus_mttcg = {
156
- .create_vcpu_thread = tcg_start_vcpu_thread,
157
+ .create_vcpu_thread = mttcg_start_vcpu_thread,
158
.kick_vcpu_thread = mttcg_kick_vcpu_thread,
159
160
.handle_interrupt = tcg_handle_interrupt,
161
diff --git a/accel/tcg/tcg-cpus-rr.c b/accel/tcg/tcg-cpus-rr.c
162
index XXXXXXX..XXXXXXX 100644
163
--- a/accel/tcg/tcg-cpus-rr.c
164
+++ b/accel/tcg/tcg-cpus-rr.c
165
@@ -XXX,XX +XXX,XX @@ static void deal_with_unplugged_cpus(void)
166
* elsewhere.
167
*/
168
169
-void *tcg_rr_cpu_thread_fn(void *arg)
170
+static void *tcg_rr_cpu_thread_fn(void *arg)
171
{
172
CPUState *cpu = arg;
173
174
@@ -XXX,XX +XXX,XX @@ void *tcg_rr_cpu_thread_fn(void *arg)
175
return NULL;
176
}
98
}
177
99
178
+void rr_start_vcpu_thread(CPUState *cpu)
100
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
179
+{
180
+ char thread_name[VCPU_THREAD_NAME_SIZE];
181
+ static QemuCond *single_tcg_halt_cond;
182
+ static QemuThread *single_tcg_cpu_thread;
183
+
184
+ g_assert(tcg_enabled());
185
+ parallel_cpus = false;
186
+
187
+ if (!single_tcg_cpu_thread) {
188
+ cpu->thread = g_malloc0(sizeof(QemuThread));
189
+ cpu->halt_cond = g_malloc0(sizeof(QemuCond));
190
+ qemu_cond_init(cpu->halt_cond);
191
+
192
+ /* share a single thread for all cpus with TCG */
193
+ snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
194
+ qemu_thread_create(cpu->thread, thread_name,
195
+ tcg_rr_cpu_thread_fn,
196
+ cpu, QEMU_THREAD_JOINABLE);
197
+
198
+ single_tcg_halt_cond = cpu->halt_cond;
199
+ single_tcg_cpu_thread = cpu->thread;
200
+#ifdef _WIN32
201
+ cpu->hThread = qemu_thread_get_handle(cpu->thread);
202
+#endif
203
+ } else {
204
+ /* we share the thread */
205
+ cpu->thread = single_tcg_cpu_thread;
206
+ cpu->halt_cond = single_tcg_halt_cond;
207
+ cpu->thread_id = first_cpu->thread_id;
208
+ cpu->can_do_io = 1;
209
+ cpu->created = true;
210
+ }
211
+}
212
+
213
const CpusAccel tcg_cpus_rr = {
214
- .create_vcpu_thread = tcg_start_vcpu_thread,
215
+ .create_vcpu_thread = rr_start_vcpu_thread,
216
.kick_vcpu_thread = qemu_cpu_kick_rr_cpus,
217
218
.handle_interrupt = tcg_handle_interrupt,
219
diff --git a/accel/tcg/tcg-cpus.c b/accel/tcg/tcg-cpus.c
220
index XXXXXXX..XXXXXXX 100644
221
--- a/accel/tcg/tcg-cpus.c
222
+++ b/accel/tcg/tcg-cpus.c
223
@@ -XXX,XX +XXX,XX @@
224
#include "hw/boards.h"
225
226
#include "tcg-cpus.h"
227
-#include "tcg-cpus-mttcg.h"
228
-#include "tcg-cpus-rr.h"
229
230
/* common functionality among all TCG variants */
231
232
-void tcg_start_vcpu_thread(CPUState *cpu)
233
-{
234
- char thread_name[VCPU_THREAD_NAME_SIZE];
235
- static QemuCond *single_tcg_halt_cond;
236
- static QemuThread *single_tcg_cpu_thread;
237
- static int tcg_region_inited;
238
-
239
- assert(tcg_enabled());
240
- /*
241
- * Initialize TCG regions--once. Now is a good time, because:
242
- * (1) TCG's init context, prologue and target globals have been set up.
243
- * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
244
- * -accel flag is processed, so the check doesn't work then).
245
- */
246
- if (!tcg_region_inited) {
247
- tcg_region_inited = 1;
248
- tcg_region_init();
249
- parallel_cpus = qemu_tcg_mttcg_enabled() && current_machine->smp.max_cpus > 1;
250
- }
251
-
252
- if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
253
- cpu->thread = g_malloc0(sizeof(QemuThread));
254
- cpu->halt_cond = g_malloc0(sizeof(QemuCond));
255
- qemu_cond_init(cpu->halt_cond);
256
-
257
- if (qemu_tcg_mttcg_enabled()) {
258
- /* create a thread per vCPU with TCG (MTTCG) */
259
- snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
260
- cpu->cpu_index);
261
-
262
- qemu_thread_create(cpu->thread, thread_name, tcg_cpu_thread_fn,
263
- cpu, QEMU_THREAD_JOINABLE);
264
-
265
- } else {
266
- /* share a single thread for all cpus with TCG */
267
- snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
268
- qemu_thread_create(cpu->thread, thread_name,
269
- tcg_rr_cpu_thread_fn,
270
- cpu, QEMU_THREAD_JOINABLE);
271
-
272
- single_tcg_halt_cond = cpu->halt_cond;
273
- single_tcg_cpu_thread = cpu->thread;
274
- }
275
-#ifdef _WIN32
276
- cpu->hThread = qemu_thread_get_handle(cpu->thread);
277
-#endif
278
- } else {
279
- /* For non-MTTCG cases we share the thread */
280
- cpu->thread = single_tcg_cpu_thread;
281
- cpu->halt_cond = single_tcg_halt_cond;
282
- cpu->thread_id = first_cpu->thread_id;
283
- cpu->can_do_io = 1;
284
- cpu->created = true;
285
- }
286
-}
287
-
288
void qemu_tcg_destroy_vcpu(CPUState *cpu)
289
{
290
cpu_thread_signal_destroyed(cpu);
291
--
101
--
292
2.25.1
102
2.34.1
293
103
294
104
New patch
1
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
2
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
3
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
4
---
5
tcg/riscv/tcg-target.c.inc | 3 ++-
6
1 file changed, 2 insertions(+), 1 deletion(-)
1
7
8
diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
9
index XXXXXXX..XXXXXXX 100644
10
--- a/tcg/riscv/tcg-target.c.inc
11
+++ b/tcg/riscv/tcg-target.c.inc
12
@@ -XXX,XX +XXX,XX @@ typedef enum {
13
#endif
14
15
OPC_FENCE = 0x0000000f,
16
+ OPC_NOP = OPC_ADDI, /* nop = addi r0,r0,0 */
17
} RISCVInsn;
18
19
/*
20
@@ -XXX,XX +XXX,XX @@ static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
21
{
22
int i;
23
for (i = 0; i < count; ++i) {
24
- p[i] = encode_i(OPC_ADDI, TCG_REG_ZERO, TCG_REG_ZERO, 0);
25
+ p[i] = OPC_NOP;
26
}
27
}
28
29
--
30
2.34.1
31
32
New patch
1
Now that tcg can handle direct and indirect goto_tb simultaneously,
2
we can optimistically leave space for a direct branch and fall back
3
to loading the pointer from the TB for an indirect branch.
1
4
5
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
---
8
tcg/riscv/tcg-target.c.inc | 19 +++++++++++++++++--
9
1 file changed, 17 insertions(+), 2 deletions(-)
10
11
diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
12
index XXXXXXX..XXXXXXX 100644
13
--- a/tcg/riscv/tcg-target.c.inc
14
+++ b/tcg/riscv/tcg-target.c.inc
15
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
16
17
static void tcg_out_goto_tb(TCGContext *s, int which)
18
{
19
- /* indirect jump method */
20
+ /* Direct branch will be patched by tb_target_set_jmp_target. */
21
+ set_jmp_insn_offset(s, which);
22
+ tcg_out32(s, OPC_JAL);
23
+
24
+ /* When branch is out of range, fall through to indirect. */
25
tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, TCG_REG_ZERO,
26
get_jmp_target_addr(s, which));
27
tcg_out_opc_imm(s, OPC_JALR, TCG_REG_ZERO, TCG_REG_TMP0, 0);
28
@@ -XXX,XX +XXX,XX @@ static void tcg_out_goto_tb(TCGContext *s, int which)
29
void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
30
uintptr_t jmp_rx, uintptr_t jmp_rw)
31
{
32
- /* Always indirect, nothing to do */
33
+ uintptr_t addr = tb->jmp_target_addr[n];
34
+ ptrdiff_t offset = addr - jmp_rx;
35
+ tcg_insn_unit insn;
36
+
37
+ /* Either directly branch, or fall through to indirect branch. */
38
+ if (offset == sextreg(offset, 0, 20)) {
39
+ insn = encode_uj(OPC_JAL, TCG_REG_ZERO, offset);
40
+ } else {
41
+ insn = OPC_NOP;
42
+ }
43
+ qatomic_set((uint32_t *)jmp_rw, insn);
44
+ flush_idcache_range(jmp_rx, jmp_rw, 4);
45
}
46
47
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
48
--
49
2.34.1
50
51