Second pull for this week, since this set is large enough by itself.

r~

The following changes since commit 7c9236d6d61f30583d5d860097d88dbf0fe487bf:

  Merge tag 'pull-tcg-20230116' of https://gitlab.com/rth7680/qemu into staging (2023-01-17 10:24:16 +0000)

are available in the Git repository at:

  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20230117

for you to fetch changes up to 493c9b19a7fb7f387c4fcf57d3836504d5242bf5:

  tcg/riscv: Implement direct branch for goto_tb (2023-01-17 22:36:17 +0000)

----------------------------------------------------------------
tcg: Fix race conditions in (most) goto_tb implementations

----------------------------------------------------------------
Richard Henderson (22):
      tcg: Split out tcg_out_exit_tb
      tcg/i386: Remove unused goto_tb code for indirect jump
      tcg/ppc: Remove unused goto_tb code for indirect jump
      tcg/sparc64: Remove unused goto_tb code for indirect jump
      tcg: Replace asserts on tcg_jmp_insn_offset
      tcg: Introduce set_jmp_insn_offset
      tcg: Introduce get_jmp_target_addr
      tcg: Split out tcg_out_goto_tb
      tcg: Rename TB_JMP_RESET_OFFSET_INVALID to TB_JMP_OFFSET_INVALID
      tcg: Add gen_tb to TCGContext
      tcg: Add TranslationBlock.jmp_insn_offset
      tcg: Change tb_target_set_jmp_target arguments
      tcg: Move tb_target_set_jmp_target declaration to tcg.h
      tcg: Always define tb_target_set_jmp_target
      tcg: Remove TCG_TARGET_HAS_direct_jump
      tcg/aarch64: Reorg goto_tb implementation
      tcg/ppc: Reorg goto_tb implementation
      tcg/sparc64: Remove USE_REG_TB
      tcg/sparc64: Reorg goto_tb implementation
      tcg/arm: Implement direct branch for goto_tb
      tcg/riscv: Introduce OPC_NOP
      tcg/riscv: Implement direct branch for goto_tb

 include/exec/exec-all.h | 5 +-
 include/tcg/tcg.h | 14 ++-
 tcg/aarch64/tcg-target.h | 6 +-
 tcg/arm/tcg-target.h | 5 -
 tcg/i386/tcg-target.h | 9 --
 tcg/loongarch64/tcg-target.h | 3 -
 tcg/mips/tcg-target.h | 5 -
 tcg/ppc/tcg-target.h | 7 +-
 tcg/riscv/tcg-target.h | 4 -
 tcg/s390x/tcg-target.h | 11 ---
 tcg/sparc64/tcg-target.h | 4 -
 tcg/tci/tcg-target.h | 4 -
 accel/tcg/cpu-exec.c | 21 ++--
 accel/tcg/translate-all.c | 10 +-
 tcg/tcg-op.c | 14 +--
 tcg/tcg.c | 42 +++++---
 tcg/aarch64/tcg-target.c.inc | 106 ++++++++++-----------
 tcg/arm/tcg-target.c.inc | 89 +++++++++++------
 tcg/i386/tcg-target.c.inc | 68 +++++++------
 tcg/loongarch64/tcg-target.c.inc | 66 +++++++------
 tcg/mips/tcg-target.c.inc | 59 +++++++-----
 tcg/ppc/tcg-target.c.inc | 193 ++++++++++++-------------------
 tcg/riscv/tcg-target.c.inc | 65 +++++++++----
 tcg/s390x/tcg-target.c.inc | 67 ++++++++-----
 tcg/sparc64/tcg-target.c.inc | 201 +++++++++++++++------------------
 tcg/tci/tcg-target.c.inc | 31 +++---
 26 files changed, 528 insertions(+), 581 deletions(-)
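
For readers new to TB chaining, here is a rough illustration (plain C with
hypothetical names -- this is not code from the series, whose backends patch
real branch instructions in place rather than a data slot) of the invariant
the series enforces: the target of a chained goto_tb may be re-patched while
another vCPU thread is executing the same translated block, so the update
must be a single aligned atomic store and a reader must see either the old
or the new target, never a torn value.

#include <stdatomic.h>
#include <stdint.h>

typedef struct {
    _Atomic uintptr_t jmp_target;   /* hypothetical 8-byte patch slot */
} tb_jump_slot;

/* Executing thread: fetch the current chain target and branch to it. */
static inline uintptr_t tb_read_jmp_target(tb_jump_slot *slot)
{
    return atomic_load_explicit(&slot->jmp_target, memory_order_acquire);
}

/* Patching thread: retarget the chained jump with one atomic store. */
static inline void tb_set_jmp_target(tb_jump_slot *slot, uintptr_t target)
{
    atomic_store_explicit(&slot->jmp_target, target, memory_order_release);
}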
The INDEX_op_exit_tb opcode needs no register allocation.
Split out a dedicated helper function for it.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg.c | 4 ++++
 tcg/aarch64/tcg-target.c.inc | 22 ++++++++++--------
 tcg/arm/tcg-target.c.inc | 11 +++++----
 tcg/i386/tcg-target.c.inc | 21 +++++++++--------
 tcg/loongarch64/tcg-target.c.inc | 22 ++++++++++--------
 tcg/mips/tcg-target.c.inc | 33 +++++++++++++--------------
 tcg/ppc/tcg-target.c.inc | 11 +++++----
 tcg/riscv/tcg-target.c.inc | 22 ++++++++++--------
 tcg/s390x/tcg-target.c.inc | 23 ++++++++++---------
 tcg/sparc64/tcg-target.c.inc | 39 +++++++++++++++++---------------
 tcg/tci/tcg-target.c.inc | 10 ++++----
 11 files changed, 121 insertions(+), 97 deletions(-)
diff --git a/tcg/tcg.c b/tcg/tcg.c
22
index XXXXXXX..XXXXXXX 100644
23
--- a/tcg/tcg.c
24
+++ b/tcg/tcg.c
25
@@ -XXX,XX +XXX,XX @@ static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg1,
26
static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg);
27
static void tcg_out_movi(TCGContext *s, TCGType type,
28
TCGReg ret, tcg_target_long arg);
29
+static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg);
30
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
31
const TCGArg args[TCG_MAX_OP_ARGS],
32
const int const_args[TCG_MAX_OP_ARGS]);
33
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, target_ulong pc_start)
34
case INDEX_op_call:
35
tcg_reg_alloc_call(s, op);
36
break;
37
+ case INDEX_op_exit_tb:
38
+ tcg_out_exit_tb(s, op->args[0]);
39
+ break;
40
case INDEX_op_dup2_vec:
41
if (tcg_reg_alloc_dup2(s, op)) {
42
break;
43
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
44
index XXXXXXX..XXXXXXX 100644
45
--- a/tcg/aarch64/tcg-target.c.inc
46
+++ b/tcg/aarch64/tcg-target.c.inc
47
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
48
49
static const tcg_insn_unit *tb_ret_addr;
50
51
+static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
52
+{
53
+ /* Reuse the zeroing that exists for goto_ptr. */
54
+ if (a0 == 0) {
55
+ tcg_out_goto_long(s, tcg_code_gen_epilogue);
56
+ } else {
57
+ tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_X0, a0);
58
+ tcg_out_goto_long(s, tb_ret_addr);
59
+ }
60
+}
61
+
62
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
63
const TCGArg args[TCG_MAX_OP_ARGS],
64
const int const_args[TCG_MAX_OP_ARGS])
65
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
66
#define REG0(I) (const_args[I] ? TCG_REG_XZR : (TCGReg)args[I])
67
68
switch (opc) {
69
- case INDEX_op_exit_tb:
70
- /* Reuse the zeroing that exists for goto_ptr. */
71
- if (a0 == 0) {
72
- tcg_out_goto_long(s, tcg_code_gen_epilogue);
73
- } else {
74
- tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_X0, a0);
75
- tcg_out_goto_long(s, tb_ret_addr);
76
- }
77
- break;
78
-
79
case INDEX_op_goto_tb:
80
tcg_debug_assert(s->tb_jmp_insn_offset != NULL);
81
/*
82
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
83
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
84
case INDEX_op_mov_i64:
85
case INDEX_op_call: /* Always emitted via tcg_out_call. */
86
+ case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
87
default:
88
g_assert_not_reached();
89
}
90
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
91
index XXXXXXX..XXXXXXX 100644
92
--- a/tcg/arm/tcg-target.c.inc
93
+++ b/tcg/arm/tcg-target.c.inc
94
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
95
96
static void tcg_out_epilogue(TCGContext *s);
97
98
+static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg)
99
+{
100
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R0, arg);
101
+ tcg_out_epilogue(s);
102
+}
103
+
104
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
105
const TCGArg args[TCG_MAX_OP_ARGS],
106
const int const_args[TCG_MAX_OP_ARGS])
107
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
108
int c;
109
110
switch (opc) {
111
- case INDEX_op_exit_tb:
112
- tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R0, args[0]);
113
- tcg_out_epilogue(s);
114
- break;
115
case INDEX_op_goto_tb:
116
{
117
/* Indirect jump method */
118
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
119
120
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
121
case INDEX_op_call: /* Always emitted via tcg_out_call. */
122
+ case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
123
default:
124
tcg_abort();
125
}
126
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
127
index XXXXXXX..XXXXXXX 100644
128
--- a/tcg/i386/tcg-target.c.inc
129
+++ b/tcg/i386/tcg-target.c.inc
130
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
131
#endif
132
}
133
134
+static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
135
+{
136
+ /* Reuse the zeroing that exists for goto_ptr. */
137
+ if (a0 == 0) {
138
+ tcg_out_jmp(s, tcg_code_gen_epilogue);
139
+ } else {
140
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
141
+ tcg_out_jmp(s, tb_ret_addr);
142
+ }
143
+}
144
+
145
static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
146
const TCGArg args[TCG_MAX_OP_ARGS],
147
const int const_args[TCG_MAX_OP_ARGS])
148
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
149
const_a2 = const_args[2];
150
151
switch (opc) {
152
- case INDEX_op_exit_tb:
153
- /* Reuse the zeroing that exists for goto_ptr. */
154
- if (a0 == 0) {
155
- tcg_out_jmp(s, tcg_code_gen_epilogue);
156
- } else {
157
- tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
158
- tcg_out_jmp(s, tb_ret_addr);
159
- }
160
- break;
161
case INDEX_op_goto_tb:
162
if (s->tb_jmp_insn_offset) {
163
/* direct jump method */
164
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
165
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
166
case INDEX_op_mov_i64:
167
case INDEX_op_call: /* Always emitted via tcg_out_call. */
168
+ case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
169
default:
170
tcg_abort();
171
}
172
diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
173
index XXXXXXX..XXXXXXX 100644
174
--- a/tcg/loongarch64/tcg-target.c.inc
175
+++ b/tcg/loongarch64/tcg-target.c.inc
176
@@ -XXX,XX +XXX,XX @@ void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
177
178
static const tcg_insn_unit *tb_ret_addr;
179
180
+static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
181
+{
182
+ /* Reuse the zeroing that exists for goto_ptr. */
183
+ if (a0 == 0) {
184
+ tcg_out_call_int(s, tcg_code_gen_epilogue, true);
185
+ } else {
186
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_A0, a0);
187
+ tcg_out_call_int(s, tb_ret_addr, true);
188
+ }
189
+}
190
+
191
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
192
const TCGArg args[TCG_MAX_OP_ARGS],
193
const int const_args[TCG_MAX_OP_ARGS])
194
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
195
int c2 = const_args[2];
196
197
switch (opc) {
198
- case INDEX_op_exit_tb:
199
- /* Reuse the zeroing that exists for goto_ptr. */
200
- if (a0 == 0) {
201
- tcg_out_call_int(s, tcg_code_gen_epilogue, true);
202
- } else {
203
- tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_A0, a0);
204
- tcg_out_call_int(s, tb_ret_addr, true);
205
- }
206
- break;
207
-
208
case INDEX_op_goto_tb:
209
tcg_debug_assert(s->tb_jmp_insn_offset != NULL);
210
/*
211
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
212
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
213
case INDEX_op_mov_i64:
214
case INDEX_op_call: /* Always emitted via tcg_out_call. */
215
+ case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
216
default:
217
g_assert_not_reached();
218
}
219
diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
220
index XXXXXXX..XXXXXXX 100644
221
--- a/tcg/mips/tcg-target.c.inc
222
+++ b/tcg/mips/tcg-target.c.inc
223
@@ -XXX,XX +XXX,XX @@ static void tcg_out_clz(TCGContext *s, MIPSInsn opcv2, MIPSInsn opcv6,
224
}
225
}
226
227
+static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
228
+{
229
+ TCGReg b0 = TCG_REG_ZERO;
230
+
231
+ if (a0 & ~0xffff) {
232
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_V0, a0 & ~0xffff);
233
+ b0 = TCG_REG_V0;
234
+ }
235
+ if (!tcg_out_opc_jmp(s, OPC_J, tb_ret_addr)) {
236
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_TMP0, (uintptr_t)tb_ret_addr);
237
+ tcg_out_opc_reg(s, OPC_JR, 0, TCG_TMP0, 0);
238
+ }
239
+ tcg_out_opc_imm(s, OPC_ORI, TCG_REG_V0, b0, a0 & 0xffff);
240
+}
241
+
242
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
243
const TCGArg args[TCG_MAX_OP_ARGS],
244
const int const_args[TCG_MAX_OP_ARGS])
245
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
246
c2 = const_args[2];
247
248
switch (opc) {
249
- case INDEX_op_exit_tb:
250
- {
251
- TCGReg b0 = TCG_REG_ZERO;
252
-
253
- a0 = (intptr_t)a0;
254
- if (a0 & ~0xffff) {
255
- tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_V0, a0 & ~0xffff);
256
- b0 = TCG_REG_V0;
257
- }
258
- if (!tcg_out_opc_jmp(s, OPC_J, tb_ret_addr)) {
259
- tcg_out_movi(s, TCG_TYPE_PTR, TCG_TMP0,
260
- (uintptr_t)tb_ret_addr);
261
- tcg_out_opc_reg(s, OPC_JR, 0, TCG_TMP0, 0);
262
- }
263
- tcg_out_opc_imm(s, OPC_ORI, TCG_REG_V0, b0, a0 & 0xffff);
264
- }
265
- break;
266
case INDEX_op_goto_tb:
267
/* indirect jump method */
268
tcg_debug_assert(s->tb_jmp_insn_offset == 0);
269
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
270
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
271
case INDEX_op_mov_i64:
272
case INDEX_op_call: /* Always emitted via tcg_out_call. */
273
+ case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
274
default:
275
tcg_abort();
276
}
277
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
278
index XXXXXXX..XXXXXXX 100644
279
--- a/tcg/ppc/tcg-target.c.inc
280
+++ b/tcg/ppc/tcg-target.c.inc
281
@@ -XXX,XX +XXX,XX @@ static void tcg_target_qemu_prologue(TCGContext *s)
282
tcg_out32(s, BCLR | BO_ALWAYS);
283
}
284
285
+static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg)
286
+{
287
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R3, arg);
288
+ tcg_out_b(s, 0, tcg_code_gen_epilogue);
289
+}
290
+
291
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
292
const TCGArg args[TCG_MAX_OP_ARGS],
293
const int const_args[TCG_MAX_OP_ARGS])
294
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
295
TCGArg a0, a1, a2;
296
297
switch (opc) {
298
- case INDEX_op_exit_tb:
299
- tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R3, args[0]);
300
- tcg_out_b(s, 0, tcg_code_gen_epilogue);
301
- break;
302
case INDEX_op_goto_tb:
303
if (s->tb_jmp_insn_offset) {
304
/* Direct jump. */
305
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
306
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
307
case INDEX_op_mov_i64:
308
case INDEX_op_call: /* Always emitted via tcg_out_call. */
309
+ case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
310
default:
311
tcg_abort();
312
}
313
diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
314
index XXXXXXX..XXXXXXX 100644
315
--- a/tcg/riscv/tcg-target.c.inc
316
+++ b/tcg/riscv/tcg-target.c.inc
317
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
318
319
static const tcg_insn_unit *tb_ret_addr;
320
321
+static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
322
+{
323
+ /* Reuse the zeroing that exists for goto_ptr. */
324
+ if (a0 == 0) {
325
+ tcg_out_call_int(s, tcg_code_gen_epilogue, true);
326
+ } else {
327
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_A0, a0);
328
+ tcg_out_call_int(s, tb_ret_addr, true);
329
+ }
330
+}
331
+
332
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
333
const TCGArg args[TCG_MAX_OP_ARGS],
334
const int const_args[TCG_MAX_OP_ARGS])
335
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
336
int c2 = const_args[2];
337
338
switch (opc) {
339
- case INDEX_op_exit_tb:
340
- /* Reuse the zeroing that exists for goto_ptr. */
341
- if (a0 == 0) {
342
- tcg_out_call_int(s, tcg_code_gen_epilogue, true);
343
- } else {
344
- tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_A0, a0);
345
- tcg_out_call_int(s, tb_ret_addr, true);
346
- }
347
- break;
348
-
349
case INDEX_op_goto_tb:
350
assert(s->tb_jmp_insn_offset == 0);
351
/* indirect jump method */
352
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
353
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
354
case INDEX_op_mov_i64:
355
case INDEX_op_call: /* Always emitted via tcg_out_call. */
356
+ case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
357
default:
358
g_assert_not_reached();
359
}
360
diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
361
index XXXXXXX..XXXXXXX 100644
362
--- a/tcg/s390x/tcg-target.c.inc
363
+++ b/tcg/s390x/tcg-target.c.inc
364
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext* s, TCGReg data_reg, TCGReg addr_reg,
365
#endif
366
}
367
368
+static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
369
+{
370
+ /* Reuse the zeroing that exists for goto_ptr. */
371
+ if (a0 == 0) {
372
+ tgen_gotoi(s, S390_CC_ALWAYS, tcg_code_gen_epilogue);
373
+ } else {
374
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R2, a0);
375
+ tgen_gotoi(s, S390_CC_ALWAYS, tb_ret_addr);
376
+ }
377
+}
378
+
379
# define OP_32_64(x) \
380
case glue(glue(INDEX_op_,x),_i32): \
381
case glue(glue(INDEX_op_,x),_i64)
382
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
383
TCGArg a0, a1, a2;
384
385
switch (opc) {
386
- case INDEX_op_exit_tb:
387
- /* Reuse the zeroing that exists for goto_ptr. */
388
- a0 = args[0];
389
- if (a0 == 0) {
390
- tgen_gotoi(s, S390_CC_ALWAYS, tcg_code_gen_epilogue);
391
- } else {
392
- tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R2, a0);
393
- tgen_gotoi(s, S390_CC_ALWAYS, tb_ret_addr);
394
- }
395
- break;
396
-
397
case INDEX_op_goto_tb:
398
a0 = args[0];
399
/*
400
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
401
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
402
case INDEX_op_mov_i64:
403
case INDEX_op_call: /* Always emitted via tcg_out_call. */
404
+ case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
405
default:
406
tcg_abort();
407
}
408
diff --git a/tcg/sparc64/tcg-target.c.inc b/tcg/sparc64/tcg-target.c.inc
409
index XXXXXXX..XXXXXXX 100644
410
--- a/tcg/sparc64/tcg-target.c.inc
411
+++ b/tcg/sparc64/tcg-target.c.inc
412
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg data, TCGReg addr,
413
#endif /* CONFIG_SOFTMMU */
414
}
415
416
+static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
417
+{
418
+ if (check_fit_ptr(a0, 13)) {
419
+ tcg_out_arithi(s, TCG_REG_G0, TCG_REG_I7, 8, RETURN);
420
+ tcg_out_movi_imm13(s, TCG_REG_O0, a0);
421
+ return;
422
+ } else if (USE_REG_TB) {
423
+ intptr_t tb_diff = tcg_tbrel_diff(s, (void *)a0);
424
+ if (check_fit_ptr(tb_diff, 13)) {
425
+ tcg_out_arithi(s, TCG_REG_G0, TCG_REG_I7, 8, RETURN);
426
+ /* Note that TCG_REG_TB has been unwound to O1. */
427
+ tcg_out_arithi(s, TCG_REG_O0, TCG_REG_O1, tb_diff, ARITH_ADD);
428
+ return;
429
+ }
430
+ }
431
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_I0, a0 & ~0x3ff);
432
+ tcg_out_arithi(s, TCG_REG_G0, TCG_REG_I7, 8, RETURN);
433
+ tcg_out_arithi(s, TCG_REG_O0, TCG_REG_O0, a0 & 0x3ff, ARITH_OR);
434
+}
435
+
436
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
437
const TCGArg args[TCG_MAX_OP_ARGS],
438
const int const_args[TCG_MAX_OP_ARGS])
439
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
440
c2 = const_args[2];
441
442
switch (opc) {
443
- case INDEX_op_exit_tb:
444
- if (check_fit_ptr(a0, 13)) {
445
- tcg_out_arithi(s, TCG_REG_G0, TCG_REG_I7, 8, RETURN);
446
- tcg_out_movi_imm13(s, TCG_REG_O0, a0);
447
- break;
448
- } else if (USE_REG_TB) {
449
- intptr_t tb_diff = tcg_tbrel_diff(s, (void *)a0);
450
- if (check_fit_ptr(tb_diff, 13)) {
451
- tcg_out_arithi(s, TCG_REG_G0, TCG_REG_I7, 8, RETURN);
452
- /* Note that TCG_REG_TB has been unwound to O1. */
453
- tcg_out_arithi(s, TCG_REG_O0, TCG_REG_O1, tb_diff, ARITH_ADD);
454
- break;
455
- }
456
- }
457
- tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_I0, a0 & ~0x3ff);
458
- tcg_out_arithi(s, TCG_REG_G0, TCG_REG_I7, 8, RETURN);
459
- tcg_out_arithi(s, TCG_REG_O0, TCG_REG_O0, a0 & 0x3ff, ARITH_OR);
460
- break;
461
case INDEX_op_goto_tb:
462
if (s->tb_jmp_insn_offset) {
463
/* direct jump method */
464
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
465
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
466
case INDEX_op_mov_i64:
467
case INDEX_op_call: /* Always emitted via tcg_out_call. */
468
+ case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
469
default:
470
tcg_abort();
471
}
472
diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
473
index XXXXXXX..XXXXXXX 100644
474
--- a/tcg/tci/tcg-target.c.inc
475
+++ b/tcg/tci/tcg-target.c.inc
476
@@ -XXX,XX +XXX,XX @@ static void tcg_out_call(TCGContext *s, const tcg_insn_unit *func,
477
# define CASE_64(x)
478
#endif
479
480
+static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg)
481
+{
482
+ tcg_out_op_p(s, INDEX_op_exit_tb, (void *)arg);
483
+}
484
+
485
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
486
const TCGArg args[TCG_MAX_OP_ARGS],
487
const int const_args[TCG_MAX_OP_ARGS])
488
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
489
TCGOpcode exts;
490
491
switch (opc) {
492
- case INDEX_op_exit_tb:
493
- tcg_out_op_p(s, opc, (void *)args[0]);
494
- break;
495
-
496
case INDEX_op_goto_tb:
497
tcg_debug_assert(s->tb_jmp_insn_offset == 0);
498
/* indirect jump method. */
499
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
500
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
501
case INDEX_op_mov_i64:
502
case INDEX_op_call: /* Always emitted via tcg_out_call. */
503
+ case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
504
default:
505
tcg_abort();
506
}
507
--
2.34.1
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/i386/tcg-target.c.inc | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
8
index XXXXXXX..XXXXXXX 100644
9
--- a/tcg/i386/tcg-target.c.inc
10
+++ b/tcg/i386/tcg-target.c.inc
11
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
12
13
switch (opc) {
14
case INDEX_op_goto_tb:
15
- if (s->tb_jmp_insn_offset) {
16
- /* direct jump method */
17
- int gap;
18
- /* jump displacement must be aligned for atomic patching;
19
+ qemu_build_assert(TCG_TARGET_HAS_direct_jump);
20
+ {
21
+ /*
22
+ * Jump displacement must be aligned for atomic patching;
23
* see if we need to add extra nops before jump
24
*/
25
- gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
26
+ int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
27
if (gap != 1) {
28
tcg_out_nopn(s, gap - 1);
29
}
30
tcg_out8(s, OPC_JMP_long); /* jmp im */
31
s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
32
tcg_out32(s, 0);
33
- } else {
34
- /* indirect jump method */
35
- tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
36
- (intptr_t)(s->tb_jmp_target_addr + a0));
37
}
38
set_jmp_reset_offset(s, a0);
39
break;
40
--
2.34.1
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/ppc/tcg-target.c.inc | 32 +++++++++++++-------------------
 1 file changed, 13 insertions(+), 19 deletions(-)
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
8
index XXXXXXX..XXXXXXX 100644
9
--- a/tcg/ppc/tcg-target.c.inc
10
+++ b/tcg/ppc/tcg-target.c.inc
11
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
12
13
switch (opc) {
14
case INDEX_op_goto_tb:
15
- if (s->tb_jmp_insn_offset) {
16
- /* Direct jump. */
17
- if (TCG_TARGET_REG_BITS == 64) {
18
- /* Ensure the next insns are 8 or 16-byte aligned. */
19
- while ((uintptr_t)s->code_ptr & (have_isa_2_07 ? 15 : 7)) {
20
- tcg_out32(s, NOP);
21
- }
22
- s->tb_jmp_insn_offset[args[0]] = tcg_current_code_size(s);
23
- tcg_out32(s, ADDIS | TAI(TCG_REG_TB, TCG_REG_TB, 0));
24
- tcg_out32(s, ADDI | TAI(TCG_REG_TB, TCG_REG_TB, 0));
25
- } else {
26
- s->tb_jmp_insn_offset[args[0]] = tcg_current_code_size(s);
27
- tcg_out32(s, B);
28
- s->tb_jmp_reset_offset[args[0]] = tcg_current_code_size(s);
29
- break;
30
+ qemu_build_assert(TCG_TARGET_HAS_direct_jump);
31
+ /* Direct jump. */
32
+ if (TCG_TARGET_REG_BITS == 64) {
33
+ /* Ensure the next insns are 8 or 16-byte aligned. */
34
+ while ((uintptr_t)s->code_ptr & (have_isa_2_07 ? 15 : 7)) {
35
+ tcg_out32(s, NOP);
36
}
37
+ s->tb_jmp_insn_offset[args[0]] = tcg_current_code_size(s);
38
+ tcg_out32(s, ADDIS | TAI(TCG_REG_TB, TCG_REG_TB, 0));
39
+ tcg_out32(s, ADDI | TAI(TCG_REG_TB, TCG_REG_TB, 0));
40
} else {
41
- /* Indirect jump. */
42
- tcg_debug_assert(s->tb_jmp_insn_offset == NULL);
43
- tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TB, 0,
44
- (intptr_t)(s->tb_jmp_insn_offset + args[0]));
45
+ s->tb_jmp_insn_offset[args[0]] = tcg_current_code_size(s);
46
+ tcg_out32(s, B);
47
+ s->tb_jmp_reset_offset[args[0]] = tcg_current_code_size(s);
48
+ break;
49
}
50
tcg_out32(s, MTSPR | RS(TCG_REG_TB) | CTR);
51
tcg_out32(s, BCCTR | BO_ALWAYS);
52
--
2.34.1
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/sparc64/tcg-target.c.inc | 41 +++++++++++-------------------
 1 file changed, 12 insertions(+), 29 deletions(-)
diff --git a/tcg/sparc64/tcg-target.c.inc b/tcg/sparc64/tcg-target.c.inc
8
index XXXXXXX..XXXXXXX 100644
9
--- a/tcg/sparc64/tcg-target.c.inc
10
+++ b/tcg/sparc64/tcg-target.c.inc
11
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
12
return false;
13
}
14
15
-static void tcg_out_ld_ptr(TCGContext *s, TCGReg ret, const void *arg)
16
-{
17
- intptr_t diff = tcg_tbrel_diff(s, arg);
18
- if (USE_REG_TB && check_fit_ptr(diff, 13)) {
19
- tcg_out_ld(s, TCG_TYPE_PTR, ret, TCG_REG_TB, diff);
20
- return;
21
- }
22
- tcg_out_movi(s, TCG_TYPE_PTR, ret, (uintptr_t)arg & ~0x3ff);
23
- tcg_out_ld(s, TCG_TYPE_PTR, ret, ret, (uintptr_t)arg & 0x3ff);
24
-}
25
-
26
static void tcg_out_sety(TCGContext *s, TCGReg rs)
27
{
28
tcg_out32(s, WRY | INSN_RS1(TCG_REG_G0) | INSN_RS2(rs));
29
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
30
31
switch (opc) {
32
case INDEX_op_goto_tb:
33
- if (s->tb_jmp_insn_offset) {
34
- /* direct jump method */
35
- if (USE_REG_TB) {
36
- /* make sure the patch is 8-byte aligned. */
37
- if ((intptr_t)s->code_ptr & 4) {
38
- tcg_out_nop(s);
39
- }
40
- s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
41
- tcg_out_sethi(s, TCG_REG_T1, 0);
42
- tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, 0, ARITH_OR);
43
- tcg_out_arith(s, TCG_REG_G0, TCG_REG_TB, TCG_REG_T1, JMPL);
44
- tcg_out_arith(s, TCG_REG_TB, TCG_REG_TB, TCG_REG_T1, ARITH_ADD);
45
- } else {
46
- s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
47
- tcg_out32(s, CALL);
48
+ qemu_build_assert(TCG_TARGET_HAS_direct_jump);
49
+ /* Direct jump. */
50
+ if (USE_REG_TB) {
51
+ /* make sure the patch is 8-byte aligned. */
52
+ if ((intptr_t)s->code_ptr & 4) {
53
tcg_out_nop(s);
54
}
55
+ s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
56
+ tcg_out_sethi(s, TCG_REG_T1, 0);
57
+ tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, 0, ARITH_OR);
58
+ tcg_out_arith(s, TCG_REG_G0, TCG_REG_TB, TCG_REG_T1, JMPL);
59
+ tcg_out_arith(s, TCG_REG_TB, TCG_REG_TB, TCG_REG_T1, ARITH_ADD);
60
} else {
61
- /* indirect jump method */
62
- tcg_out_ld_ptr(s, TCG_REG_TB, s->tb_jmp_target_addr + a0);
63
- tcg_out_arithi(s, TCG_REG_G0, TCG_REG_TB, 0, JMPL);
64
+ s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
65
+ tcg_out32(s, CALL);
66
tcg_out_nop(s);
67
}
68
set_jmp_reset_offset(s, a0);
69
--
2.34.1
Test TCG_TARGET_HAS_direct_jump instead of testing an
implementation pointer.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/aarch64/tcg-target.c.inc | 2 +-
 tcg/arm/tcg-target.c.inc | 2 +-
 tcg/loongarch64/tcg-target.c.inc | 2 +-
 tcg/mips/tcg-target.c.inc | 2 +-
 tcg/riscv/tcg-target.c.inc | 2 +-
 tcg/tci/tcg-target.c.inc | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)
16
15
17
diff --git a/scripts/git.orderfile b/scripts/git.orderfile
16
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
18
index XXXXXXX..XXXXXXX 100644
17
index XXXXXXX..XXXXXXX 100644
19
--- a/scripts/git.orderfile
18
--- a/tcg/aarch64/tcg-target.c.inc
20
+++ b/scripts/git.orderfile
19
+++ b/tcg/aarch64/tcg-target.c.inc
21
@@ -XXX,XX +XXX,XX @@ qga/*.json
20
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
22
# headers
21
23
*.h
22
switch (opc) {
24
23
case INDEX_op_goto_tb:
25
+# decoding tree specification
24
- tcg_debug_assert(s->tb_jmp_insn_offset != NULL);
26
+*.decode
25
+ qemu_build_assert(TCG_TARGET_HAS_direct_jump);
27
+
26
/*
28
# code
27
* Ensure that ADRP+ADD are 8-byte aligned so that an atomic
29
*.c
28
* write can be used to patch the target address.
29
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
30
index XXXXXXX..XXXXXXX 100644
31
--- a/tcg/arm/tcg-target.c.inc
32
+++ b/tcg/arm/tcg-target.c.inc
33
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
34
intptr_t ptr, dif, dil;
35
TCGReg base = TCG_REG_PC;
36
37
- tcg_debug_assert(s->tb_jmp_insn_offset == 0);
38
+ qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
39
ptr = (intptr_t)tcg_splitwx_to_rx(s->tb_jmp_target_addr + args[0]);
40
dif = tcg_pcrel_diff(s, (void *)ptr) - 8;
41
dil = sextract32(dif, 0, 12);
42
diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
43
index XXXXXXX..XXXXXXX 100644
44
--- a/tcg/loongarch64/tcg-target.c.inc
45
+++ b/tcg/loongarch64/tcg-target.c.inc
46
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
47
48
switch (opc) {
49
case INDEX_op_goto_tb:
50
- tcg_debug_assert(s->tb_jmp_insn_offset != NULL);
51
+ qemu_build_assert(TCG_TARGET_HAS_direct_jump);
52
/*
53
* Ensure that patch area is 8-byte aligned so that an
54
* atomic write can be used to patch the target address.
55
diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
56
index XXXXXXX..XXXXXXX 100644
57
--- a/tcg/mips/tcg-target.c.inc
58
+++ b/tcg/mips/tcg-target.c.inc
59
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
60
switch (opc) {
61
case INDEX_op_goto_tb:
62
/* indirect jump method */
63
- tcg_debug_assert(s->tb_jmp_insn_offset == 0);
64
+ qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
65
tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP0, TCG_REG_ZERO,
66
(uintptr_t)(s->tb_jmp_target_addr + a0));
67
tcg_out_opc_reg(s, OPC_JR, 0, TCG_TMP0, 0);
68
diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
69
index XXXXXXX..XXXXXXX 100644
70
--- a/tcg/riscv/tcg-target.c.inc
71
+++ b/tcg/riscv/tcg-target.c.inc
72
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
73
74
switch (opc) {
75
case INDEX_op_goto_tb:
76
- assert(s->tb_jmp_insn_offset == 0);
77
+ qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
78
/* indirect jump method */
79
tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, TCG_REG_ZERO,
80
(uintptr_t)(s->tb_jmp_target_addr + a0));
81
diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
82
index XXXXXXX..XXXXXXX 100644
83
--- a/tcg/tci/tcg-target.c.inc
84
+++ b/tcg/tci/tcg-target.c.inc
85
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
86
87
switch (opc) {
88
case INDEX_op_goto_tb:
89
- tcg_debug_assert(s->tb_jmp_insn_offset == 0);
90
+ qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
91
/* indirect jump method. */
92
tcg_out_op_p(s, opc, s->tb_jmp_target_addr + args[0]);
93
set_jmp_reset_offset(s, args[0]);
30
--
2.34.1
Similar to the existing set_jmp_reset_offset. Move any assert for
TCG_TARGET_HAS_direct_jump into the new function (which now cannot
be build-time). Will be unused if TCG_TARGET_HAS_direct_jump is
constant 0, but we can't test for constant in the preprocessor,
so just mark it G_GNUC_UNUSED.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg.c | 10 ++++++++++
 tcg/aarch64/tcg-target.c.inc | 3 +--
 tcg/i386/tcg-target.c.inc | 3 +--
 tcg/loongarch64/tcg-target.c.inc | 3 +--
 tcg/ppc/tcg-target.c.inc | 7 +++----
 tcg/s390x/tcg-target.c.inc | 2 +-
 tcg/sparc64/tcg-target.c.inc | 5 ++---
 7 files changed, 19 insertions(+), 14 deletions(-)
17
18
18
diff --git a/vl.c b/vl.c
19
diff --git a/tcg/tcg.c b/tcg/tcg.c
19
index XXXXXXX..XXXXXXX 100644
20
index XXXXXXX..XXXXXXX 100644
20
--- a/vl.c
21
--- a/tcg/tcg.c
21
+++ b/vl.c
22
+++ b/tcg/tcg.c
22
@@ -XXX,XX +XXX,XX @@ static void configure_accelerators(const char *progname)
23
@@ -XXX,XX +XXX,XX @@ static void set_jmp_reset_offset(TCGContext *s, int which)
23
24
s->tb_jmp_reset_offset[which] = tcg_current_code_size(s);
24
if (accel == NULL) {
25
}
25
/* Select the default accelerator */
26
26
- if (!accel_find("tcg") && !accel_find("kvm")) {
27
+static void G_GNUC_UNUSED set_jmp_insn_offset(TCGContext *s, int which)
27
- error_report("No accelerator selected and"
28
+{
28
- " no default accelerator available");
29
+ /*
29
- exit(1);
30
+ * We will check for overflow at the end of the opcode loop in
30
- } else {
31
+ * tcg_gen_code, where we bound tcg_current_code_size to UINT16_MAX.
31
- int pnlen = strlen(progname);
32
+ */
32
- if (pnlen >= 3 && g_str_equal(&progname[pnlen - 3], "kvm")) {
33
+ tcg_debug_assert(TCG_TARGET_HAS_direct_jump);
33
+ bool have_tcg = accel_find("tcg");
34
+ s->tb_jmp_insn_offset[which] = tcg_current_code_size(s);
34
+ bool have_kvm = accel_find("kvm");
35
+}
35
+
36
+
36
+ if (have_tcg && have_kvm) {
37
/* Signal overflow, starting over with fewer guest insns. */
37
+ if (g_str_has_suffix(progname, "kvm")) {
38
static G_NORETURN
38
/* If the program name ends with "kvm", we prefer KVM */
39
void tcg_raise_tb_overflow(TCGContext *s)
39
accel = "kvm:tcg";
40
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
40
} else {
41
index XXXXXXX..XXXXXXX 100644
41
accel = "tcg:kvm";
42
--- a/tcg/aarch64/tcg-target.c.inc
42
}
43
+++ b/tcg/aarch64/tcg-target.c.inc
43
+ } else if (have_kvm) {
44
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
44
+ accel = "kvm";
45
45
+ } else if (have_tcg) {
46
switch (opc) {
46
+ accel = "tcg";
47
case INDEX_op_goto_tb:
47
+ } else {
48
- qemu_build_assert(TCG_TARGET_HAS_direct_jump);
48
+ error_report("No accelerator selected and"
49
/*
49
+ " no default accelerator available");
50
* Ensure that ADRP+ADD are 8-byte aligned so that an atomic
50
+ exit(1);
51
* write can be used to patch the target address.
52
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
53
if ((uintptr_t)s->code_ptr & 7) {
54
tcg_out32(s, NOP);
55
}
56
- s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
57
+ set_jmp_insn_offset(s, a0);
58
/*
59
* actual branch destination will be patched by
60
* tb_target_set_jmp_target later
61
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
62
index XXXXXXX..XXXXXXX 100644
63
--- a/tcg/i386/tcg-target.c.inc
64
+++ b/tcg/i386/tcg-target.c.inc
65
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
66
67
switch (opc) {
68
case INDEX_op_goto_tb:
69
- qemu_build_assert(TCG_TARGET_HAS_direct_jump);
70
{
71
/*
72
* Jump displacement must be aligned for atomic patching;
73
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
74
tcg_out_nopn(s, gap - 1);
51
}
75
}
76
tcg_out8(s, OPC_JMP_long); /* jmp im */
77
- s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
78
+ set_jmp_insn_offset(s, a0);
79
tcg_out32(s, 0);
52
}
80
}
53
-
81
set_jmp_reset_offset(s, a0);
54
accel_list = g_strsplit(accel, ":", 0);
82
diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
55
83
index XXXXXXX..XXXXXXX 100644
56
for (tmp = accel_list; *tmp; tmp++) {
84
--- a/tcg/loongarch64/tcg-target.c.inc
85
+++ b/tcg/loongarch64/tcg-target.c.inc
86
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
87
88
switch (opc) {
89
case INDEX_op_goto_tb:
90
- qemu_build_assert(TCG_TARGET_HAS_direct_jump);
91
/*
92
* Ensure that patch area is 8-byte aligned so that an
93
* atomic write can be used to patch the target address.
94
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
95
if ((uintptr_t)s->code_ptr & 7) {
96
tcg_out_nop(s);
97
}
98
- s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
99
+ set_jmp_insn_offset(s, a0);
100
/*
101
* actual branch destination will be patched by
102
* tb_target_set_jmp_target later
103
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
104
index XXXXXXX..XXXXXXX 100644
105
--- a/tcg/ppc/tcg-target.c.inc
106
+++ b/tcg/ppc/tcg-target.c.inc
107
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
108
109
switch (opc) {
110
case INDEX_op_goto_tb:
111
- qemu_build_assert(TCG_TARGET_HAS_direct_jump);
112
/* Direct jump. */
113
if (TCG_TARGET_REG_BITS == 64) {
114
/* Ensure the next insns are 8 or 16-byte aligned. */
115
while ((uintptr_t)s->code_ptr & (have_isa_2_07 ? 15 : 7)) {
116
tcg_out32(s, NOP);
117
}
118
- s->tb_jmp_insn_offset[args[0]] = tcg_current_code_size(s);
119
+ set_jmp_insn_offset(s, args[0]);
120
tcg_out32(s, ADDIS | TAI(TCG_REG_TB, TCG_REG_TB, 0));
121
tcg_out32(s, ADDI | TAI(TCG_REG_TB, TCG_REG_TB, 0));
122
} else {
123
- s->tb_jmp_insn_offset[args[0]] = tcg_current_code_size(s);
124
+ set_jmp_insn_offset(s, args[0]);
125
tcg_out32(s, B);
126
- s->tb_jmp_reset_offset[args[0]] = tcg_current_code_size(s);
127
+ set_jmp_reset_offset(s, args[0]);
128
break;
129
}
130
tcg_out32(s, MTSPR | RS(TCG_REG_TB) | CTR);
131
diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
132
index XXXXXXX..XXXXXXX 100644
133
--- a/tcg/s390x/tcg-target.c.inc
134
+++ b/tcg/s390x/tcg-target.c.inc
135
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
136
tcg_out16(s, NOP);
137
}
138
tcg_out16(s, RIL_BRCL | (S390_CC_ALWAYS << 4));
139
- s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
140
+ set_jmp_insn_offset(s, a0);
141
s->code_ptr += 2;
142
set_jmp_reset_offset(s, a0);
143
break;
144
diff --git a/tcg/sparc64/tcg-target.c.inc b/tcg/sparc64/tcg-target.c.inc
145
index XXXXXXX..XXXXXXX 100644
146
--- a/tcg/sparc64/tcg-target.c.inc
147
+++ b/tcg/sparc64/tcg-target.c.inc
148
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
149
150
switch (opc) {
151
case INDEX_op_goto_tb:
152
- qemu_build_assert(TCG_TARGET_HAS_direct_jump);
153
/* Direct jump. */
154
if (USE_REG_TB) {
155
/* make sure the patch is 8-byte aligned. */
156
if ((intptr_t)s->code_ptr & 4) {
157
tcg_out_nop(s);
158
}
159
- s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
160
+ set_jmp_insn_offset(s, a0);
161
tcg_out_sethi(s, TCG_REG_T1, 0);
162
tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, 0, ARITH_OR);
163
tcg_out_arith(s, TCG_REG_G0, TCG_REG_TB, TCG_REG_T1, JMPL);
164
tcg_out_arith(s, TCG_REG_TB, TCG_REG_TB, TCG_REG_T1, ARITH_ADD);
165
} else {
166
- s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
167
+ set_jmp_insn_offset(s, a0);
168
tcg_out32(s, CALL);
169
tcg_out_nop(s);
170
}
57
--
2.34.1
Similar to the existing set_jmp_reset_offset. Include the
rw->rx address space conversion done by arm and s390x, and
forgotten by mips and riscv.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg.c | 9 +++++++++
 tcg/arm/tcg-target.c.inc | 2 +-
 tcg/mips/tcg-target.c.inc | 2 +-
 tcg/riscv/tcg-target.c.inc | 2 +-
 tcg/tci/tcg-target.c.inc | 2 +-
 5 files changed, 13 insertions(+), 4 deletions(-)
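
The rw->rx conversion mentioned above comes from QEMU's split mapping of the
code buffer: one writable (rw) view for the code generator and one executable
(rx) view for the CPU, with tcg_splitwx_to_rx() rebasing pointers between
them. As a rough sketch only (hypothetical buffer names, not the series'
code):

#include <stdint.h>

static uint8_t *buf_rw;          /* writable view of the code buffer */
static const uint8_t *buf_rx;    /* executable view of the same buffer */

/* Rebase a pointer from the rw mapping into the rx mapping, so that
 * pc-relative addressing computed at execution time uses the address
 * the CPU will actually fetch from. */
static inline const void *splitwx_to_rx(const void *rw_ptr)
{
    return buf_rx + ((const uint8_t *)rw_ptr - buf_rw);
}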
10
15
11
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
16
diff --git a/tcg/tcg.c b/tcg/tcg.c
12
index XXXXXXX..XXXXXXX 100644
17
index XXXXXXX..XXXXXXX 100644
13
--- a/accel/tcg/cputlb.c
18
--- a/tcg/tcg.c
14
+++ b/accel/tcg/cputlb.c
19
+++ b/tcg/tcg.c
15
@@ -XXX,XX +XXX,XX @@ static void tlb_mmu_init(CPUTLBDesc *desc, CPUTLBDescFast *fast, int64_t now)
20
@@ -XXX,XX +XXX,XX @@ static void G_GNUC_UNUSED set_jmp_insn_offset(TCGContext *s, int which)
16
fast->mask = (n_entries - 1) << CPU_TLB_ENTRY_BITS;
21
s->tb_jmp_insn_offset[which] = tcg_current_code_size(s);
17
fast->table = g_new(CPUTLBEntry, n_entries);
18
desc->iotlb = g_new(CPUIOTLBEntry, n_entries);
19
+ tlb_mmu_flush_locked(desc, fast);
20
}
22
}
21
23
22
static inline void tlb_n_used_entries_inc(CPUArchState *env, uintptr_t mmu_idx)
24
+static uintptr_t G_GNUC_UNUSED get_jmp_target_addr(TCGContext *s, int which)
23
@@ -XXX,XX +XXX,XX @@ void tlb_init(CPUState *cpu)
25
+{
24
26
+ /*
25
qemu_spin_init(&env_tlb(env)->c.lock);
27
+ * Return the read-execute version of the pointer, for the benefit
26
28
+ * of any pc-relative addressing mode.
27
- /* Ensure that cpu_reset performs a full flush. */
29
+ */
28
- env_tlb(env)->c.dirty = ALL_MMUIDX_BITS;
30
+ return (uintptr_t)tcg_splitwx_to_rx(&s->tb_jmp_target_addr[which]);
29
+ /* All tlbs are initialized flushed. */
31
+}
30
+ env_tlb(env)->c.dirty = 0;
32
+
31
33
/* Signal overflow, starting over with fewer guest insns. */
32
for (i = 0; i < NB_MMU_MODES; i++) {
34
static G_NORETURN
33
tlb_mmu_init(&env_tlb(env)->d[i], &env_tlb(env)->f[i], now);
35
void tcg_raise_tb_overflow(TCGContext *s)
36
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
37
index XXXXXXX..XXXXXXX 100644
38
--- a/tcg/arm/tcg-target.c.inc
39
+++ b/tcg/arm/tcg-target.c.inc
40
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
41
TCGReg base = TCG_REG_PC;
42
43
qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
44
- ptr = (intptr_t)tcg_splitwx_to_rx(s->tb_jmp_target_addr + args[0]);
45
+ ptr = get_jmp_target_addr(s, args[0]);
46
dif = tcg_pcrel_diff(s, (void *)ptr) - 8;
47
dil = sextract32(dif, 0, 12);
48
if (dif != dil) {
49
diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
50
index XXXXXXX..XXXXXXX 100644
51
--- a/tcg/mips/tcg-target.c.inc
52
+++ b/tcg/mips/tcg-target.c.inc
53
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
54
/* indirect jump method */
55
qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
56
tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP0, TCG_REG_ZERO,
57
- (uintptr_t)(s->tb_jmp_target_addr + a0));
58
+ get_jmp_target_addr(s, a0));
59
tcg_out_opc_reg(s, OPC_JR, 0, TCG_TMP0, 0);
60
tcg_out_nop(s);
61
set_jmp_reset_offset(s, a0);
62
diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
63
index XXXXXXX..XXXXXXX 100644
64
--- a/tcg/riscv/tcg-target.c.inc
65
+++ b/tcg/riscv/tcg-target.c.inc
66
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
67
qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
68
/* indirect jump method */
69
tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, TCG_REG_ZERO,
70
- (uintptr_t)(s->tb_jmp_target_addr + a0));
71
+ get_jmp_target_addr(s, a0));
72
tcg_out_opc_imm(s, OPC_JALR, TCG_REG_ZERO, TCG_REG_TMP0, 0);
73
set_jmp_reset_offset(s, a0);
74
break;
75
diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
76
index XXXXXXX..XXXXXXX 100644
77
--- a/tcg/tci/tcg-target.c.inc
78
+++ b/tcg/tci/tcg-target.c.inc
79
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
80
case INDEX_op_goto_tb:
81
qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
82
/* indirect jump method. */
83
- tcg_out_op_p(s, opc, s->tb_jmp_target_addr + args[0]);
84
+ tcg_out_op_p(s, opc, (void *)get_jmp_target_addr(s, args[0]));
85
set_jmp_reset_offset(s, args[0]);
86
break;
87
34
--
2.34.1
The INDEX_op_goto_tb opcode needs no register allocation.
Split out a dedicated helper function for it.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg.c | 4 ++
 tcg/aarch64/tcg-target.c.inc | 40 ++++++++++---------
 tcg/arm/tcg-target.c.inc | 49 ++++++++++-----------
 tcg/i386/tcg-target.c.inc | 33 ++++++++--------
 tcg/loongarch64/tcg-target.c.inc | 38 +++++++++---------
 tcg/mips/tcg-target.c.inc | 21 +++++-----
 tcg/ppc/tcg-target.c.inc | 52 ++++++++++++------------
 tcg/riscv/tcg-target.c.inc | 20 +++++-----
 tcg/s390x/tcg-target.c.inc | 31 ++++++++-------
 tcg/sparc64/tcg-target.c.inc | 68 +++++++++++++++++---------------
 tcg/tci/tcg-target.c.inc | 16 ++++----
 11 files changed, 199 insertions(+), 173 deletions(-)
11
20
12
diff --git a/vl.c b/vl.c
21
diff --git a/tcg/tcg.c b/tcg/tcg.c
13
index XXXXXXX..XXXXXXX 100644
22
index XXXXXXX..XXXXXXX 100644
14
--- a/vl.c
23
--- a/tcg/tcg.c
15
+++ b/vl.c
24
+++ b/tcg/tcg.c
16
@@ -XXX,XX +XXX,XX @@ static void configure_accelerators(const char *progname)
25
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg);
17
26
static void tcg_out_movi(TCGContext *s, TCGType type,
18
accel_list = g_strsplit(accel, ":", 0);
27
TCGReg ret, tcg_target_long arg);
19
28
static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg);
20
- for (tmp = accel_list; tmp && *tmp; tmp++) {
29
+static void tcg_out_goto_tb(TCGContext *s, int which);
21
+ for (tmp = accel_list; *tmp; tmp++) {
30
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
22
/*
31
const TCGArg args[TCG_MAX_OP_ARGS],
23
* Filter invalid accelerators here, to prevent obscenities
32
const int const_args[TCG_MAX_OP_ARGS]);
24
* such as "-machine accel=tcg,,thread=single".
33
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, target_ulong pc_start)
34
case INDEX_op_exit_tb:
35
tcg_out_exit_tb(s, op->args[0]);
36
break;
37
+ case INDEX_op_goto_tb:
38
+ tcg_out_goto_tb(s, op->args[0]);
39
+ break;
40
case INDEX_op_dup2_vec:
41
if (tcg_reg_alloc_dup2(s, op)) {
42
break;
43
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
44
index XXXXXXX..XXXXXXX 100644
45
--- a/tcg/aarch64/tcg-target.c.inc
46
+++ b/tcg/aarch64/tcg-target.c.inc
47
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
48
}
49
}
50
51
+static void tcg_out_goto_tb(TCGContext *s, int which)
52
+{
53
+ /*
54
+ * Ensure that ADRP+ADD are 8-byte aligned so that an atomic
55
+ * write can be used to patch the target address.
56
+ */
57
+ if ((uintptr_t)s->code_ptr & 7) {
58
+ tcg_out32(s, NOP);
59
+ }
60
+ set_jmp_insn_offset(s, which);
61
+ /*
62
+ * actual branch destination will be patched by
63
+ * tb_target_set_jmp_target later
64
+ */
65
+ tcg_out_insn(s, 3406, ADRP, TCG_REG_TMP, 0);
66
+ tcg_out_insn(s, 3401, ADDI, TCG_TYPE_I64, TCG_REG_TMP, TCG_REG_TMP, 0);
67
+ tcg_out_insn(s, 3207, BR, TCG_REG_TMP);
68
+ set_jmp_reset_offset(s, which);
69
+}
70
+
71
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
72
const TCGArg args[TCG_MAX_OP_ARGS],
73
const int const_args[TCG_MAX_OP_ARGS])
74
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
75
#define REG0(I) (const_args[I] ? TCG_REG_XZR : (TCGReg)args[I])
76
77
switch (opc) {
78
- case INDEX_op_goto_tb:
79
- /*
80
- * Ensure that ADRP+ADD are 8-byte aligned so that an atomic
81
- * write can be used to patch the target address.
82
- */
83
- if ((uintptr_t)s->code_ptr & 7) {
84
- tcg_out32(s, NOP);
85
- }
86
- set_jmp_insn_offset(s, a0);
87
- /*
88
- * actual branch destination will be patched by
89
- * tb_target_set_jmp_target later
90
- */
91
- tcg_out_insn(s, 3406, ADRP, TCG_REG_TMP, 0);
92
- tcg_out_insn(s, 3401, ADDI, TCG_TYPE_I64, TCG_REG_TMP, TCG_REG_TMP, 0);
93
- tcg_out_insn(s, 3207, BR, TCG_REG_TMP);
94
- set_jmp_reset_offset(s, a0);
95
- break;
96
-
97
case INDEX_op_goto_ptr:
98
tcg_out_insn(s, 3207, BR, a0);
99
break;
100
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
101
case INDEX_op_mov_i64:
102
case INDEX_op_call: /* Always emitted via tcg_out_call. */
103
case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
104
+ case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */
105
default:
106
g_assert_not_reached();
107
}
108
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
109
index XXXXXXX..XXXXXXX 100644
110
--- a/tcg/arm/tcg-target.c.inc
111
+++ b/tcg/arm/tcg-target.c.inc
112
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg)
113
tcg_out_epilogue(s);
114
}
115
116
+static void tcg_out_goto_tb(TCGContext *s, int which)
117
+{
118
+ /* Indirect jump method */
119
+ intptr_t ptr, dif, dil;
120
+ TCGReg base = TCG_REG_PC;
121
+
122
+ qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
123
+ ptr = get_jmp_target_addr(s, which);
124
+ dif = tcg_pcrel_diff(s, (void *)ptr) - 8;
125
+ dil = sextract32(dif, 0, 12);
126
+ if (dif != dil) {
127
+ /*
128
+ * The TB is close, but outside the 12 bits addressable by
129
+ * the load. We can extend this to 20 bits with a sub of a
130
+ * shifted immediate from pc. In the vastly unlikely event
131
+ * the code requires more than 1MB, we'll use 2 insns and
132
+ * be no worse off.
133
+ */
134
+ base = TCG_REG_R0;
135
+ tcg_out_movi32(s, COND_AL, base, ptr - dil);
136
+ }
137
+ tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, base, dil);
138
+ set_jmp_reset_offset(s, which);
139
+}
140
+
141
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
142
const TCGArg args[TCG_MAX_OP_ARGS],
143
const int const_args[TCG_MAX_OP_ARGS])
144
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
145
int c;
146
147
switch (opc) {
148
- case INDEX_op_goto_tb:
149
- {
150
- /* Indirect jump method */
151
- intptr_t ptr, dif, dil;
152
- TCGReg base = TCG_REG_PC;
153
-
154
- qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
155
- ptr = get_jmp_target_addr(s, args[0]);
156
- dif = tcg_pcrel_diff(s, (void *)ptr) - 8;
157
- dil = sextract32(dif, 0, 12);
158
- if (dif != dil) {
159
- /* The TB is close, but outside the 12 bits addressable by
160
- the load. We can extend this to 20 bits with a sub of a
161
- shifted immediate from pc. In the vastly unlikely event
162
- the code requires more than 1MB, we'll use 2 insns and
163
- be no worse off. */
164
- base = TCG_REG_R0;
165
- tcg_out_movi32(s, COND_AL, base, ptr - dil);
166
- }
167
- tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, base, dil);
168
- set_jmp_reset_offset(s, args[0]);
169
- }
170
- break;
171
case INDEX_op_goto_ptr:
172
tcg_out_b_reg(s, COND_AL, args[0]);
173
break;
174
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
175
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
176
case INDEX_op_call: /* Always emitted via tcg_out_call. */
177
case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
178
+ case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */
179
default:
180
tcg_abort();
181
}
182
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
183
index XXXXXXX..XXXXXXX 100644
184
--- a/tcg/i386/tcg-target.c.inc
185
+++ b/tcg/i386/tcg-target.c.inc
186
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
187
}
188
}
189
190
+static void tcg_out_goto_tb(TCGContext *s, int which)
191
+{
192
+ /*
193
+ * Jump displacement must be aligned for atomic patching;
194
+ * see if we need to add extra nops before jump
195
+ */
196
+ int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
197
+ if (gap != 1) {
198
+ tcg_out_nopn(s, gap - 1);
199
+ }
200
+ tcg_out8(s, OPC_JMP_long); /* jmp im */
201
+ set_jmp_insn_offset(s, which);
202
+ tcg_out32(s, 0);
203
+ set_jmp_reset_offset(s, which);
204
+}
205
+
206
static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
207
const TCGArg args[TCG_MAX_OP_ARGS],
208
const int const_args[TCG_MAX_OP_ARGS])
209
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
210
const_a2 = const_args[2];
211
212
switch (opc) {
213
- case INDEX_op_goto_tb:
214
- {
215
- /*
216
- * Jump displacement must be aligned for atomic patching;
217
- * see if we need to add extra nops before jump
218
- */
219
- int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
220
- if (gap != 1) {
221
- tcg_out_nopn(s, gap - 1);
222
- }
223
- tcg_out8(s, OPC_JMP_long); /* jmp im */
224
- set_jmp_insn_offset(s, a0);
225
- tcg_out32(s, 0);
226
- }
227
- set_jmp_reset_offset(s, a0);
228
- break;
229
case INDEX_op_goto_ptr:
230
/* jmp to the given host address (could be epilogue) */
231
tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
232
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
233
case INDEX_op_mov_i64:
234
case INDEX_op_call: /* Always emitted via tcg_out_call. */
235
case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
236
+ case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */
237
default:
238
tcg_abort();
239
}
240
diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
241
index XXXXXXX..XXXXXXX 100644
242
--- a/tcg/loongarch64/tcg-target.c.inc
243
+++ b/tcg/loongarch64/tcg-target.c.inc
244
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
245
}
246
}
247
248
+static void tcg_out_goto_tb(TCGContext *s, int which)
249
+{
250
+ /*
251
+ * Ensure that patch area is 8-byte aligned so that an
252
+ * atomic write can be used to patch the target address.
253
+ */
254
+ if ((uintptr_t)s->code_ptr & 7) {
255
+ tcg_out_nop(s);
256
+ }
257
+ set_jmp_insn_offset(s, which);
258
+ /*
259
+ * actual branch destination will be patched by
260
+ * tb_target_set_jmp_target later
261
+ */
262
+ tcg_out_opc_pcaddu18i(s, TCG_REG_TMP0, 0);
263
+ tcg_out_opc_jirl(s, TCG_REG_ZERO, TCG_REG_TMP0, 0);
264
+ set_jmp_reset_offset(s, which);
265
+}
266
+
267
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
268
const TCGArg args[TCG_MAX_OP_ARGS],
269
const int const_args[TCG_MAX_OP_ARGS])
270
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
271
int c2 = const_args[2];
272
273
switch (opc) {
274
- case INDEX_op_goto_tb:
275
- /*
276
- * Ensure that patch area is 8-byte aligned so that an
277
- * atomic write can be used to patch the target address.
278
- */
279
- if ((uintptr_t)s->code_ptr & 7) {
280
- tcg_out_nop(s);
281
- }
282
- set_jmp_insn_offset(s, a0);
283
- /*
284
- * actual branch destination will be patched by
285
- * tb_target_set_jmp_target later
286
- */
287
- tcg_out_opc_pcaddu18i(s, TCG_REG_TMP0, 0);
288
- tcg_out_opc_jirl(s, TCG_REG_ZERO, TCG_REG_TMP0, 0);
289
- set_jmp_reset_offset(s, a0);
290
- break;
291
-
292
case INDEX_op_mb:
293
tcg_out_mb(s, a0);
294
break;
295
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
296
case INDEX_op_mov_i64:
297
case INDEX_op_call: /* Always emitted via tcg_out_call. */
298
case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
299
+ case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */
300
default:
301
g_assert_not_reached();
302
}
303
diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
304
index XXXXXXX..XXXXXXX 100644
305
--- a/tcg/mips/tcg-target.c.inc
306
+++ b/tcg/mips/tcg-target.c.inc
307
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
308
tcg_out_opc_imm(s, OPC_ORI, TCG_REG_V0, b0, a0 & 0xffff);
309
}
310
311
+static void tcg_out_goto_tb(TCGContext *s, int which)
312
+{
313
+ /* indirect jump method */
314
+ qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
315
+ tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP0, TCG_REG_ZERO,
316
+ get_jmp_target_addr(s, which));
317
+ tcg_out_opc_reg(s, OPC_JR, 0, TCG_TMP0, 0);
318
+ tcg_out_nop(s);
319
+ set_jmp_reset_offset(s, which);
320
+}
321
+
322
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
323
const TCGArg args[TCG_MAX_OP_ARGS],
324
const int const_args[TCG_MAX_OP_ARGS])
325
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
326
c2 = const_args[2];
327
328
switch (opc) {
329
- case INDEX_op_goto_tb:
330
- /* indirect jump method */
331
- qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
332
- tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP0, TCG_REG_ZERO,
333
- get_jmp_target_addr(s, a0));
334
- tcg_out_opc_reg(s, OPC_JR, 0, TCG_TMP0, 0);
335
- tcg_out_nop(s);
336
- set_jmp_reset_offset(s, a0);
337
- break;
338
case INDEX_op_goto_ptr:
339
/* jmp to the given host address (could be epilogue) */
340
tcg_out_opc_reg(s, OPC_JR, 0, a0, 0);
341
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
342
case INDEX_op_mov_i64:
343
case INDEX_op_call: /* Always emitted via tcg_out_call. */
344
case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
345
+ case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */
346
default:
347
tcg_abort();
348
}
349
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
350
index XXXXXXX..XXXXXXX 100644
351
--- a/tcg/ppc/tcg-target.c.inc
352
+++ b/tcg/ppc/tcg-target.c.inc
353
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg)
354
tcg_out_b(s, 0, tcg_code_gen_epilogue);
355
}
356
357
+static void tcg_out_goto_tb(TCGContext *s, int which)
358
+{
359
+ /* Direct jump. */
360
+ if (TCG_TARGET_REG_BITS == 64) {
361
+ /* Ensure the next insns are 8 or 16-byte aligned. */
362
+ while ((uintptr_t)s->code_ptr & (have_isa_2_07 ? 15 : 7)) {
363
+ tcg_out32(s, NOP);
364
+ }
365
+ set_jmp_insn_offset(s, which);
366
+ tcg_out32(s, ADDIS | TAI(TCG_REG_TB, TCG_REG_TB, 0));
367
+ tcg_out32(s, ADDI | TAI(TCG_REG_TB, TCG_REG_TB, 0));
368
+ tcg_out32(s, MTSPR | RS(TCG_REG_TB) | CTR);
369
+ tcg_out32(s, BCCTR | BO_ALWAYS);
370
+ set_jmp_reset_offset(s, which);
371
+ if (USE_REG_TB) {
372
+ /* For the unlinked case, need to reset TCG_REG_TB. */
373
+ tcg_out_mem_long(s, ADDI, ADD, TCG_REG_TB, TCG_REG_TB,
374
+ -tcg_current_code_size(s));
375
+ }
376
+ } else {
377
+ set_jmp_insn_offset(s, which);
378
+ tcg_out32(s, B);
379
+ set_jmp_reset_offset(s, which);
380
+ }
381
+}
382
+
383
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
384
const TCGArg args[TCG_MAX_OP_ARGS],
385
const int const_args[TCG_MAX_OP_ARGS])
386
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
387
TCGArg a0, a1, a2;
388
389
switch (opc) {
390
- case INDEX_op_goto_tb:
391
- /* Direct jump. */
392
- if (TCG_TARGET_REG_BITS == 64) {
393
- /* Ensure the next insns are 8 or 16-byte aligned. */
394
- while ((uintptr_t)s->code_ptr & (have_isa_2_07 ? 15 : 7)) {
395
- tcg_out32(s, NOP);
396
- }
397
- set_jmp_insn_offset(s, args[0]);
398
- tcg_out32(s, ADDIS | TAI(TCG_REG_TB, TCG_REG_TB, 0));
399
- tcg_out32(s, ADDI | TAI(TCG_REG_TB, TCG_REG_TB, 0));
400
- } else {
401
- set_jmp_insn_offset(s, args[0]);
402
- tcg_out32(s, B);
403
- set_jmp_reset_offset(s, args[0]);
404
- break;
405
- }
406
- tcg_out32(s, MTSPR | RS(TCG_REG_TB) | CTR);
407
- tcg_out32(s, BCCTR | BO_ALWAYS);
408
- set_jmp_reset_offset(s, args[0]);
409
- if (USE_REG_TB) {
410
- /* For the unlinked case, need to reset TCG_REG_TB. */
411
- tcg_out_mem_long(s, ADDI, ADD, TCG_REG_TB, TCG_REG_TB,
412
- -tcg_current_code_size(s));
413
- }
414
- break;
415
case INDEX_op_goto_ptr:
416
tcg_out32(s, MTSPR | RS(args[0]) | CTR);
417
if (USE_REG_TB) {
418
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
419
case INDEX_op_mov_i64:
420
case INDEX_op_call: /* Always emitted via tcg_out_call. */
421
case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
422
+ case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */
423
default:
424
tcg_abort();
425
}
426
diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
427
index XXXXXXX..XXXXXXX 100644
428
--- a/tcg/riscv/tcg-target.c.inc
429
+++ b/tcg/riscv/tcg-target.c.inc
430
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
431
}
432
}
433
434
+static void tcg_out_goto_tb(TCGContext *s, int which)
435
+{
436
+ qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
437
+ /* indirect jump method */
438
+ tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, TCG_REG_ZERO,
439
+ get_jmp_target_addr(s, which));
440
+ tcg_out_opc_imm(s, OPC_JALR, TCG_REG_ZERO, TCG_REG_TMP0, 0);
441
+ set_jmp_reset_offset(s, which);
442
+}
443
+
444
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
445
const TCGArg args[TCG_MAX_OP_ARGS],
446
const int const_args[TCG_MAX_OP_ARGS])
447
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
448
int c2 = const_args[2];
449
450
switch (opc) {
451
- case INDEX_op_goto_tb:
452
- qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
453
- /* indirect jump method */
454
- tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, TCG_REG_ZERO,
455
- get_jmp_target_addr(s, a0));
456
- tcg_out_opc_imm(s, OPC_JALR, TCG_REG_ZERO, TCG_REG_TMP0, 0);
457
- set_jmp_reset_offset(s, a0);
458
- break;
459
-
460
case INDEX_op_goto_ptr:
461
tcg_out_opc_imm(s, OPC_JALR, TCG_REG_ZERO, a0, 0);
462
break;
463
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
464
case INDEX_op_mov_i64:
465
case INDEX_op_call: /* Always emitted via tcg_out_call. */
466
case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
467
+ case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */
468
default:
469
g_assert_not_reached();
470
}
471
diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
472
index XXXXXXX..XXXXXXX 100644
473
--- a/tcg/s390x/tcg-target.c.inc
474
+++ b/tcg/s390x/tcg-target.c.inc
475
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
476
}
477
}
478
479
+static void tcg_out_goto_tb(TCGContext *s, int which)
480
+{
481
+ /*
482
+ * Branch displacement must be aligned for atomic patching;
483
+ * see if we need to add extra nop before branch
484
+ */
485
+ if (!QEMU_PTR_IS_ALIGNED(s->code_ptr + 1, 4)) {
486
+ tcg_out16(s, NOP);
487
+ }
488
+ tcg_out16(s, RIL_BRCL | (S390_CC_ALWAYS << 4));
489
+ set_jmp_insn_offset(s, which);
490
+ s->code_ptr += 2;
491
+ set_jmp_reset_offset(s, which);
492
+}
493
+
494
# define OP_32_64(x) \
495
case glue(glue(INDEX_op_,x),_i32): \
496
case glue(glue(INDEX_op_,x),_i64)
497
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
498
TCGArg a0, a1, a2;
499
500
switch (opc) {
501
- case INDEX_op_goto_tb:
502
- a0 = args[0];
503
- /*
504
- * branch displacement must be aligned for atomic patching;
505
- * see if we need to add extra nop before branch
506
- */
507
- if (!QEMU_PTR_IS_ALIGNED(s->code_ptr + 1, 4)) {
508
- tcg_out16(s, NOP);
509
- }
510
- tcg_out16(s, RIL_BRCL | (S390_CC_ALWAYS << 4));
511
- set_jmp_insn_offset(s, a0);
512
- s->code_ptr += 2;
513
- set_jmp_reset_offset(s, a0);
514
- break;
515
-
516
case INDEX_op_goto_ptr:
517
a0 = args[0];
518
tcg_out_insn(s, RR, BCR, S390_CC_ALWAYS, a0);
519
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
520
case INDEX_op_mov_i64:
521
case INDEX_op_call: /* Always emitted via tcg_out_call. */
522
case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
523
+ case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */
524
default:
525
tcg_abort();
526
}
527
diff --git a/tcg/sparc64/tcg-target.c.inc b/tcg/sparc64/tcg-target.c.inc
528
index XXXXXXX..XXXXXXX 100644
529
--- a/tcg/sparc64/tcg-target.c.inc
530
+++ b/tcg/sparc64/tcg-target.c.inc
531
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
532
tcg_out_arithi(s, TCG_REG_O0, TCG_REG_O0, a0 & 0x3ff, ARITH_OR);
533
}
534
535
+static void tcg_out_goto_tb(TCGContext *s, int which)
536
+{
537
+ /* Direct jump. */
538
+ if (USE_REG_TB) {
539
+ /* make sure the patch is 8-byte aligned. */
540
+ if ((intptr_t)s->code_ptr & 4) {
541
+ tcg_out_nop(s);
542
+ }
543
+ set_jmp_insn_offset(s, which);
544
+ tcg_out_sethi(s, TCG_REG_T1, 0);
545
+ tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, 0, ARITH_OR);
546
+ tcg_out_arith(s, TCG_REG_G0, TCG_REG_TB, TCG_REG_T1, JMPL);
547
+ tcg_out_arith(s, TCG_REG_TB, TCG_REG_TB, TCG_REG_T1, ARITH_ADD);
548
+ } else {
549
+ set_jmp_insn_offset(s, which);
550
+ tcg_out32(s, CALL);
551
+ tcg_out_nop(s);
552
+ }
553
+ set_jmp_reset_offset(s, which);
554
+
555
+ /*
556
+ * For the unlinked path of goto_tb, we need to reset TCG_REG_TB
557
+ * to the beginning of this TB.
558
+ */
559
+ if (USE_REG_TB) {
560
+ int c = -tcg_current_code_size(s);
561
+ if (check_fit_i32(c, 13)) {
562
+ tcg_out_arithi(s, TCG_REG_TB, TCG_REG_TB, c, ARITH_ADD);
563
+ } else {
564
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_T1, c);
565
+ tcg_out_arith(s, TCG_REG_TB, TCG_REG_TB, TCG_REG_T1, ARITH_ADD);
566
+ }
567
+ }
568
+}
569
+
570
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
571
const TCGArg args[TCG_MAX_OP_ARGS],
572
const int const_args[TCG_MAX_OP_ARGS])
573
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
574
c2 = const_args[2];
575
576
switch (opc) {
577
- case INDEX_op_goto_tb:
578
- /* Direct jump. */
579
- if (USE_REG_TB) {
580
- /* make sure the patch is 8-byte aligned. */
581
- if ((intptr_t)s->code_ptr & 4) {
582
- tcg_out_nop(s);
583
- }
584
- set_jmp_insn_offset(s, a0);
585
- tcg_out_sethi(s, TCG_REG_T1, 0);
586
- tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, 0, ARITH_OR);
587
- tcg_out_arith(s, TCG_REG_G0, TCG_REG_TB, TCG_REG_T1, JMPL);
588
- tcg_out_arith(s, TCG_REG_TB, TCG_REG_TB, TCG_REG_T1, ARITH_ADD);
589
- } else {
590
- set_jmp_insn_offset(s, a0);
591
- tcg_out32(s, CALL);
592
- tcg_out_nop(s);
593
- }
594
- set_jmp_reset_offset(s, a0);
595
-
596
- /* For the unlinked path of goto_tb, we need to reset
597
- TCG_REG_TB to the beginning of this TB. */
598
- if (USE_REG_TB) {
599
- c = -tcg_current_code_size(s);
600
- if (check_fit_i32(c, 13)) {
601
- tcg_out_arithi(s, TCG_REG_TB, TCG_REG_TB, c, ARITH_ADD);
602
- } else {
603
- tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_T1, c);
604
- tcg_out_arith(s, TCG_REG_TB, TCG_REG_TB,
605
- TCG_REG_T1, ARITH_ADD);
606
- }
607
- }
608
- break;
609
case INDEX_op_goto_ptr:
610
tcg_out_arithi(s, TCG_REG_G0, a0, 0, JMPL);
611
if (USE_REG_TB) {
612
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
613
case INDEX_op_mov_i64:
614
case INDEX_op_call: /* Always emitted via tcg_out_call. */
615
case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
616
+ case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */
617
default:
618
tcg_abort();
619
}
620
diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
621
index XXXXXXX..XXXXXXX 100644
622
--- a/tcg/tci/tcg-target.c.inc
623
+++ b/tcg/tci/tcg-target.c.inc
624
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg)
625
tcg_out_op_p(s, INDEX_op_exit_tb, (void *)arg);
626
}
627
628
+static void tcg_out_goto_tb(TCGContext *s, int which)
629
+{
630
+ qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
631
+ /* indirect jump method. */
632
+ tcg_out_op_p(s, INDEX_op_goto_tb, (void *)get_jmp_target_addr(s, which));
633
+ set_jmp_reset_offset(s, which);
634
+}
635
+
636
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
637
const TCGArg args[TCG_MAX_OP_ARGS],
638
const int const_args[TCG_MAX_OP_ARGS])
639
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
640
TCGOpcode exts;
641
642
switch (opc) {
643
- case INDEX_op_goto_tb:
644
- qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
645
- /* indirect jump method. */
646
- tcg_out_op_p(s, opc, (void *)get_jmp_target_addr(s, args[0]));
647
- set_jmp_reset_offset(s, args[0]);
648
- break;
649
-
650
case INDEX_op_goto_ptr:
651
tcg_out_op_r(s, opc, args[0]);
652
break;
653
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
654
case INDEX_op_mov_i64:
655
case INDEX_op_call: /* Always emitted via tcg_out_call. */
656
case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
657
+ case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */
658
default:
659
tcg_abort();
660
}
25
--
661
--
26
2.20.1
662
2.34.1
27
663
28
664
diff view generated by jsdifflib
1
The accel_initialised variable no longer has any setters.
1
This will shortly be used for more than reset.
2
2
3
Fixes: 6f6e1698a68c
4
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
5
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
3
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
6
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
4
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
7
Reviewed-by: Aleksandar Markovic <amarkovic@wavecomp.com>
8
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
9
---
6
---
10
vl.c | 3 +--
7
include/exec/exec-all.h | 2 +-
11
1 file changed, 1 insertion(+), 2 deletions(-)
8
accel/tcg/translate-all.c | 8 ++++----
9
tcg/tcg.c | 4 ++--
10
3 files changed, 7 insertions(+), 7 deletions(-)
12
11
13
diff --git a/vl.c b/vl.c
12
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
14
index XXXXXXX..XXXXXXX 100644
13
index XXXXXXX..XXXXXXX 100644
15
--- a/vl.c
14
--- a/include/exec/exec-all.h
16
+++ b/vl.c
15
+++ b/include/exec/exec-all.h
17
@@ -XXX,XX +XXX,XX @@ static void configure_accelerators(const char *progname)
16
@@ -XXX,XX +XXX,XX @@ struct TranslationBlock {
18
{
17
* setting one of the jump targets (or patching the jump instruction). Only
19
const char *accel;
18
* two of such jumps are supported.
20
char **accel_list, **tmp;
19
*/
21
- bool accel_initialised = false;
20
+#define TB_JMP_OFFSET_INVALID 0xffff /* indicates no jump generated */
22
bool init_failed = false;
21
uint16_t jmp_reset_offset[2]; /* offset of original jump target */
23
22
-#define TB_JMP_RESET_OFFSET_INVALID 0xffff /* indicates no jump generated */
24
qemu_opts_foreach(qemu_find_opts("icount"),
23
uintptr_t jmp_target_arg[2]; /* target address or offset */
25
@@ -XXX,XX +XXX,XX @@ static void configure_accelerators(const char *progname)
24
26
25
/*
27
accel_list = g_strsplit(accel, ":", 0);
26
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
28
27
index XXXXXXX..XXXXXXX 100644
29
- for (tmp = accel_list; !accel_initialised && tmp && *tmp; tmp++) {
28
--- a/accel/tcg/translate-all.c
30
+ for (tmp = accel_list; tmp && *tmp; tmp++) {
29
+++ b/accel/tcg/translate-all.c
31
/*
30
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
32
* Filter invalid accelerators here, to prevent obscenities
31
tb->jmp_dest[1] = (uintptr_t)NULL;
33
* such as "-machine accel=tcg,,thread=single".
32
33
/* init original jump addresses which have been set during tcg_gen_code() */
34
- if (tb->jmp_reset_offset[0] != TB_JMP_RESET_OFFSET_INVALID) {
35
+ if (tb->jmp_reset_offset[0] != TB_JMP_OFFSET_INVALID) {
36
tb_reset_jump(tb, 0);
37
}
38
- if (tb->jmp_reset_offset[1] != TB_JMP_RESET_OFFSET_INVALID) {
39
+ if (tb->jmp_reset_offset[1] != TB_JMP_OFFSET_INVALID) {
40
tb_reset_jump(tb, 1);
41
}
42
43
@@ -XXX,XX +XXX,XX @@ static gboolean tb_tree_stats_iter(gpointer key, gpointer value, gpointer data)
44
if (tb_page_addr1(tb) != -1) {
45
tst->cross_page++;
46
}
47
- if (tb->jmp_reset_offset[0] != TB_JMP_RESET_OFFSET_INVALID) {
48
+ if (tb->jmp_reset_offset[0] != TB_JMP_OFFSET_INVALID) {
49
tst->direct_jmp_count++;
50
- if (tb->jmp_reset_offset[1] != TB_JMP_RESET_OFFSET_INVALID) {
51
+ if (tb->jmp_reset_offset[1] != TB_JMP_OFFSET_INVALID) {
52
tst->direct_jmp2_count++;
53
}
54
}
55
diff --git a/tcg/tcg.c b/tcg/tcg.c
56
index XXXXXXX..XXXXXXX 100644
57
--- a/tcg/tcg.c
58
+++ b/tcg/tcg.c
59
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, target_ulong pc_start)
60
#endif
61
62
/* Initialize goto_tb jump offsets. */
63
- tb->jmp_reset_offset[0] = TB_JMP_RESET_OFFSET_INVALID;
64
- tb->jmp_reset_offset[1] = TB_JMP_RESET_OFFSET_INVALID;
65
+ tb->jmp_reset_offset[0] = TB_JMP_OFFSET_INVALID;
66
+ tb->jmp_reset_offset[1] = TB_JMP_OFFSET_INVALID;
67
tcg_ctx->tb_jmp_reset_offset = tb->jmp_reset_offset;
68
if (TCG_TARGET_HAS_direct_jump) {
69
tcg_ctx->tb_jmp_insn_offset = tb->jmp_target_arg;
34
--
70
--
35
2.20.1
71
2.34.1
36
72
37
73
diff view generated by jsdifflib
1
In target/arm we will shortly have "too many" mmu_idx.
1
This can replace four other variables that are references
2
The current minimum barrier is caused by the way in which
2
into the TranslationBlock structure.
3
tlb_flush_page_by_mmuidx is coded.
4
3
5
We can remove this limitation by allocating memory for
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
6
consumption by the worker. Let us assume that this is
7
the unlikely case, as it will be for the majority
8
of targets which have so far satisfied the BUILD_BUG_ON,
9
and only allocate memory when necessary.
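For illustration only (not QEMU code), a standalone sketch of the encoding trick described above: when idxmap fits below TARGET_PAGE_SIZE it rides in the low bits of the page-aligned address and is split back out on the worker side; only the larger idxmaps need an allocated structure. The page size of 4 KiB, the sample address and the bitmap value are assumptions for the example.

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define TARGET_PAGE_BITS 12                 /* assumed for the sketch */
    #define TARGET_PAGE_SIZE (1u << TARGET_PAGE_BITS)
    #define TARGET_PAGE_MASK (~(uint64_t)(TARGET_PAGE_SIZE - 1))

    int main(void)
    {
        uint64_t addr = 0x7f1234000ULL & TARGET_PAGE_MASK; /* page aligned */
        uint16_t idxmap = 0x003f;                /* small mmu_idx bitmap */

        assert(idxmap < TARGET_PAGE_SIZE);       /* cheap, no-allocation path */
        uint64_t packed = addr | idxmap;         /* what would be passed to the worker */

        /* Worker side: split the two fields back out. */
        uint64_t waddr = packed & TARGET_PAGE_MASK;
        uint16_t wmap  = packed & ~TARGET_PAGE_MASK;

        printf("addr=%#llx idxmap=%#x\n", (unsigned long long)waddr, wmap);
        assert(waddr == addr && wmap == idxmap);
        return 0;
    }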
10
11
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
12
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
13
---
6
---
14
accel/tcg/cputlb.c | 167 +++++++++++++++++++++++++++++++++++----------
7
include/tcg/tcg.h | 11 +++--------
15
1 file changed, 132 insertions(+), 35 deletions(-)
8
accel/tcg/translate-all.c | 2 +-
9
tcg/tcg-op.c | 14 +++++++-------
10
tcg/tcg.c | 14 +++-----------
11
4 files changed, 14 insertions(+), 27 deletions(-)
16
12
17
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
13
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
18
index XXXXXXX..XXXXXXX 100644
14
index XXXXXXX..XXXXXXX 100644
19
--- a/accel/tcg/cputlb.c
15
--- a/include/tcg/tcg.h
20
+++ b/accel/tcg/cputlb.c
16
+++ b/include/tcg/tcg.h
21
@@ -XXX,XX +XXX,XX @@ static void tlb_flush_page_locked(CPUArchState *env, int midx,
17
@@ -XXX,XX +XXX,XX @@ struct TCGContext {
18
int nb_indirects;
19
int nb_ops;
20
21
- /* goto_tb support */
22
- tcg_insn_unit *code_buf;
23
- uint16_t *tb_jmp_reset_offset; /* tb->jmp_reset_offset */
24
- uintptr_t *tb_jmp_insn_offset; /* tb->jmp_target_arg if direct_jump */
25
- uintptr_t *tb_jmp_target_addr; /* tb->jmp_target_arg if !direct_jump */
26
-
27
TCGRegSet reserved_regs;
28
- uint32_t tb_cflags; /* cflags of the current TB */
29
intptr_t current_frame_offset;
30
intptr_t frame_start;
31
intptr_t frame_end;
32
TCGTemp *frame_temp;
33
34
- tcg_insn_unit *code_ptr;
35
+ TranslationBlock *gen_tb; /* tb for which code is being generated */
36
+ tcg_insn_unit *code_buf; /* pointer for start of tb */
37
+ tcg_insn_unit *code_ptr; /* pointer for running end of tb */
38
39
#ifdef CONFIG_PROFILER
40
TCGProfile prof;
41
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
42
index XXXXXXX..XXXXXXX 100644
43
--- a/accel/tcg/translate-all.c
44
+++ b/accel/tcg/translate-all.c
45
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
46
tb->trace_vcpu_dstate = *cpu->trace_dstate;
47
tb_set_page_addr0(tb, phys_pc);
48
tb_set_page_addr1(tb, -1);
49
- tcg_ctx->tb_cflags = cflags;
50
+ tcg_ctx->gen_tb = tb;
51
tb_overflow:
52
53
#ifdef CONFIG_PROFILER
54
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
55
index XXXXXXX..XXXXXXX 100644
56
--- a/tcg/tcg-op.c
57
+++ b/tcg/tcg-op.c
58
@@ -XXX,XX +XXX,XX @@ void tcg_gen_op6(TCGOpcode opc, TCGArg a1, TCGArg a2, TCGArg a3,
59
60
void tcg_gen_mb(TCGBar mb_type)
61
{
62
- if (tcg_ctx->tb_cflags & CF_PARALLEL) {
63
+ if (tcg_ctx->gen_tb->cflags & CF_PARALLEL) {
64
tcg_gen_op1(INDEX_op_mb, mb_type);
22
}
65
}
23
}
66
}
24
67
@@ -XXX,XX +XXX,XX @@ void tcg_gen_exit_tb(const TranslationBlock *tb, unsigned idx)
25
-/* As we are going to hijack the bottom bits of the page address for a
68
void tcg_gen_goto_tb(unsigned idx)
26
- * mmuidx bit mask we need to fail to build if we can't do that
27
+/**
28
+ * tlb_flush_page_by_mmuidx_async_0:
29
+ * @cpu: cpu on which to flush
30
+ * @addr: page of virtual address to flush
31
+ * @idxmap: set of mmu_idx to flush
32
+ *
33
+ * Helper for tlb_flush_page_by_mmuidx and friends, flush one page
34
+ * at @addr from the tlbs indicated by @idxmap from @cpu.
35
*/
36
-QEMU_BUILD_BUG_ON(NB_MMU_MODES > TARGET_PAGE_BITS_MIN);
37
-
38
-static void tlb_flush_page_by_mmuidx_async_work(CPUState *cpu,
39
- run_on_cpu_data data)
40
+static void tlb_flush_page_by_mmuidx_async_0(CPUState *cpu,
41
+ target_ulong addr,
42
+ uint16_t idxmap)
43
{
69
{
44
CPUArchState *env = cpu->env_ptr;
70
/* We tested CF_NO_GOTO_TB in translator_use_goto_tb. */
45
- target_ulong addr_and_mmuidx = (target_ulong) data.target_ptr;
71
- tcg_debug_assert(!(tcg_ctx->tb_cflags & CF_NO_GOTO_TB));
46
- target_ulong addr = addr_and_mmuidx & TARGET_PAGE_MASK;
72
+ tcg_debug_assert(!(tcg_ctx->gen_tb->cflags & CF_NO_GOTO_TB));
47
- unsigned long mmu_idx_bitmap = addr_and_mmuidx & ALL_MMUIDX_BITS;
73
/* We only support two chained exits. */
48
int mmu_idx;
74
tcg_debug_assert(idx <= TB_EXIT_IDXMAX);
49
75
#ifdef CONFIG_DEBUG_TCG
50
assert_cpu_is_self(cpu);
76
@@ -XXX,XX +XXX,XX @@ void tcg_gen_lookup_and_goto_ptr(void)
51
77
{
52
- tlb_debug("page addr:" TARGET_FMT_lx " mmu_map:0x%lx\n",
78
TCGv_ptr ptr;
53
- addr, mmu_idx_bitmap);
79
54
+ tlb_debug("page addr:" TARGET_FMT_lx " mmu_map:0x%x\n", addr, idxmap);
80
- if (tcg_ctx->tb_cflags & CF_NO_GOTO_PTR) {
55
81
+ if (tcg_ctx->gen_tb->cflags & CF_NO_GOTO_PTR) {
56
qemu_spin_lock(&env_tlb(env)->c.lock);
82
tcg_gen_exit_tb(NULL, 0);
57
for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
83
return;
58
- if (test_bit(mmu_idx, &mmu_idx_bitmap)) {
59
+ if ((idxmap >> mmu_idx) & 1) {
60
tlb_flush_page_locked(env, mmu_idx, addr);
61
}
62
}
84
}
63
@@ -XXX,XX +XXX,XX @@ static void tlb_flush_page_by_mmuidx_async_work(CPUState *cpu,
85
@@ -XXX,XX +XXX,XX @@ void tcg_gen_atomic_cmpxchg_i32(TCGv_i32 retv, TCGv addr, TCGv_i32 cmpv,
64
tb_flush_jmp_cache(cpu, addr);
86
{
87
memop = tcg_canonicalize_memop(memop, 0, 0);
88
89
- if (!(tcg_ctx->tb_cflags & CF_PARALLEL)) {
90
+ if (!(tcg_ctx->gen_tb->cflags & CF_PARALLEL)) {
91
TCGv_i32 t1 = tcg_temp_new_i32();
92
TCGv_i32 t2 = tcg_temp_new_i32();
93
94
@@ -XXX,XX +XXX,XX @@ void tcg_gen_atomic_cmpxchg_i64(TCGv_i64 retv, TCGv addr, TCGv_i64 cmpv,
95
{
96
memop = tcg_canonicalize_memop(memop, 1, 0);
97
98
- if (!(tcg_ctx->tb_cflags & CF_PARALLEL)) {
99
+ if (!(tcg_ctx->gen_tb->cflags & CF_PARALLEL)) {
100
TCGv_i64 t1 = tcg_temp_new_i64();
101
TCGv_i64 t2 = tcg_temp_new_i64();
102
103
@@ -XXX,XX +XXX,XX @@ static void * const table_##NAME[(MO_SIZE | MO_BSWAP) + 1] = { \
104
void tcg_gen_atomic_##NAME##_i32 \
105
(TCGv_i32 ret, TCGv addr, TCGv_i32 val, TCGArg idx, MemOp memop) \
106
{ \
107
- if (tcg_ctx->tb_cflags & CF_PARALLEL) { \
108
+ if (tcg_ctx->gen_tb->cflags & CF_PARALLEL) { \
109
do_atomic_op_i32(ret, addr, val, idx, memop, table_##NAME); \
110
} else { \
111
do_nonatomic_op_i32(ret, addr, val, idx, memop, NEW, \
112
@@ -XXX,XX +XXX,XX @@ void tcg_gen_atomic_##NAME##_i32 \
113
void tcg_gen_atomic_##NAME##_i64 \
114
(TCGv_i64 ret, TCGv addr, TCGv_i64 val, TCGArg idx, MemOp memop) \
115
{ \
116
- if (tcg_ctx->tb_cflags & CF_PARALLEL) { \
117
+ if (tcg_ctx->gen_tb->cflags & CF_PARALLEL) { \
118
do_atomic_op_i64(ret, addr, val, idx, memop, table_##NAME); \
119
} else { \
120
do_nonatomic_op_i64(ret, addr, val, idx, memop, NEW, \
121
diff --git a/tcg/tcg.c b/tcg/tcg.c
122
index XXXXXXX..XXXXXXX 100644
123
--- a/tcg/tcg.c
124
+++ b/tcg/tcg.c
125
@@ -XXX,XX +XXX,XX @@ static void set_jmp_reset_offset(TCGContext *s, int which)
126
* We will check for overflow at the end of the opcode loop in
127
* tcg_gen_code, where we bound tcg_current_code_size to UINT16_MAX.
128
*/
129
- s->tb_jmp_reset_offset[which] = tcg_current_code_size(s);
130
+ s->gen_tb->jmp_reset_offset[which] = tcg_current_code_size(s);
65
}
131
}
66
132
67
+/**
133
static void G_GNUC_UNUSED set_jmp_insn_offset(TCGContext *s, int which)
68
+ * tlb_flush_page_by_mmuidx_async_1:
134
@@ -XXX,XX +XXX,XX @@ static void G_GNUC_UNUSED set_jmp_insn_offset(TCGContext *s, int which)
69
+ * @cpu: cpu on which to flush
135
* tcg_gen_code, where we bound tcg_current_code_size to UINT16_MAX.
70
+ * @data: encoded addr + idxmap
136
*/
71
+ *
137
tcg_debug_assert(TCG_TARGET_HAS_direct_jump);
72
+ * Helper for tlb_flush_page_by_mmuidx and friends, called through
138
- s->tb_jmp_insn_offset[which] = tcg_current_code_size(s);
73
+ * async_run_on_cpu. The idxmap parameter is encoded in the page
139
+ s->gen_tb->jmp_target_arg[which] = tcg_current_code_size(s);
74
+ * offset of the target_ptr field. This limits the set of mmu_idx
75
+ * that can be passed via this method.
76
+ */
77
+static void tlb_flush_page_by_mmuidx_async_1(CPUState *cpu,
78
+ run_on_cpu_data data)
79
+{
80
+ target_ulong addr_and_idxmap = (target_ulong) data.target_ptr;
81
+ target_ulong addr = addr_and_idxmap & TARGET_PAGE_MASK;
82
+ uint16_t idxmap = addr_and_idxmap & ~TARGET_PAGE_MASK;
83
+
84
+ tlb_flush_page_by_mmuidx_async_0(cpu, addr, idxmap);
85
+}
86
+
87
+typedef struct {
88
+ target_ulong addr;
89
+ uint16_t idxmap;
90
+} TLBFlushPageByMMUIdxData;
91
+
92
+/**
93
+ * tlb_flush_page_by_mmuidx_async_2:
94
+ * @cpu: cpu on which to flush
95
+ * @data: allocated addr + idxmap
96
+ *
97
+ * Helper for tlb_flush_page_by_mmuidx and friends, called through
98
+ * async_run_on_cpu. The addr+idxmap parameters are stored in a
99
+ * TLBFlushPageByMMUIdxData structure that has been allocated
100
+ * specifically for this helper. Free the structure when done.
101
+ */
102
+static void tlb_flush_page_by_mmuidx_async_2(CPUState *cpu,
103
+ run_on_cpu_data data)
104
+{
105
+ TLBFlushPageByMMUIdxData *d = data.host_ptr;
106
+
107
+ tlb_flush_page_by_mmuidx_async_0(cpu, d->addr, d->idxmap);
108
+ g_free(d);
109
+}
110
+
111
void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, uint16_t idxmap)
112
{
113
- target_ulong addr_and_mmu_idx;
114
-
115
tlb_debug("addr: "TARGET_FMT_lx" mmu_idx:%" PRIx16 "\n", addr, idxmap);
116
117
/* This should already be page aligned */
118
- addr_and_mmu_idx = addr & TARGET_PAGE_MASK;
119
- addr_and_mmu_idx |= idxmap;
120
+ addr &= TARGET_PAGE_MASK;
121
122
- if (!qemu_cpu_is_self(cpu)) {
123
- async_run_on_cpu(cpu, tlb_flush_page_by_mmuidx_async_work,
124
- RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx));
125
+ if (qemu_cpu_is_self(cpu)) {
126
+ tlb_flush_page_by_mmuidx_async_0(cpu, addr, idxmap);
127
+ } else if (idxmap < TARGET_PAGE_SIZE) {
128
+ /*
129
+ * Most targets have only a few mmu_idx. In the case where
130
+ * we can stuff idxmap into the low TARGET_PAGE_BITS, avoid
131
+ * allocating memory for this operation.
132
+ */
133
+ async_run_on_cpu(cpu, tlb_flush_page_by_mmuidx_async_1,
134
+ RUN_ON_CPU_TARGET_PTR(addr | idxmap));
135
} else {
136
- tlb_flush_page_by_mmuidx_async_work(
137
- cpu, RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx));
138
+ TLBFlushPageByMMUIdxData *d = g_new(TLBFlushPageByMMUIdxData, 1);
139
+
140
+ /* Otherwise allocate a structure, freed by the worker. */
141
+ d->addr = addr;
142
+ d->idxmap = idxmap;
143
+ async_run_on_cpu(cpu, tlb_flush_page_by_mmuidx_async_2,
144
+ RUN_ON_CPU_HOST_PTR(d));
145
}
146
}
140
}
147
141
148
@@ -XXX,XX +XXX,XX @@ void tlb_flush_page(CPUState *cpu, target_ulong addr)
142
static uintptr_t G_GNUC_UNUSED get_jmp_target_addr(TCGContext *s, int which)
149
void tlb_flush_page_by_mmuidx_all_cpus(CPUState *src_cpu, target_ulong addr,
143
@@ -XXX,XX +XXX,XX @@ static uintptr_t G_GNUC_UNUSED get_jmp_target_addr(TCGContext *s, int which)
150
uint16_t idxmap)
144
* Return the read-execute version of the pointer, for the benefit
151
{
145
* of any pc-relative addressing mode.
152
- const run_on_cpu_func fn = tlb_flush_page_by_mmuidx_async_work;
146
*/
153
- target_ulong addr_and_mmu_idx;
147
- return (uintptr_t)tcg_splitwx_to_rx(&s->tb_jmp_target_addr[which]);
154
-
148
+ return (uintptr_t)tcg_splitwx_to_rx(s->gen_tb->jmp_target_arg + which);
155
tlb_debug("addr: "TARGET_FMT_lx" mmu_idx:%"PRIx16"\n", addr, idxmap);
156
157
/* This should already be page aligned */
158
- addr_and_mmu_idx = addr & TARGET_PAGE_MASK;
159
- addr_and_mmu_idx |= idxmap;
160
+ addr &= TARGET_PAGE_MASK;
161
162
- flush_all_helper(src_cpu, fn, RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx));
163
- fn(src_cpu, RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx));
164
+ /*
165
+ * Allocate memory to hold addr+idxmap only when needed.
166
+ * See tlb_flush_page_by_mmuidx for details.
167
+ */
168
+ if (idxmap < TARGET_PAGE_SIZE) {
169
+ flush_all_helper(src_cpu, tlb_flush_page_by_mmuidx_async_1,
170
+ RUN_ON_CPU_TARGET_PTR(addr | idxmap));
171
+ } else {
172
+ CPUState *dst_cpu;
173
+
174
+ /* Allocate a separate data block for each destination cpu. */
175
+ CPU_FOREACH(dst_cpu) {
176
+ if (dst_cpu != src_cpu) {
177
+ TLBFlushPageByMMUIdxData *d
178
+ = g_new(TLBFlushPageByMMUIdxData, 1);
179
+
180
+ d->addr = addr;
181
+ d->idxmap = idxmap;
182
+ async_run_on_cpu(dst_cpu, tlb_flush_page_by_mmuidx_async_2,
183
+ RUN_ON_CPU_HOST_PTR(d));
184
+ }
185
+ }
186
+ }
187
+
188
+ tlb_flush_page_by_mmuidx_async_0(src_cpu, addr, idxmap);
189
}
149
}
190
150
191
void tlb_flush_page_all_cpus(CPUState *src, target_ulong addr)
151
/* Signal overflow, starting over with fewer guest insns. */
192
@@ -XXX,XX +XXX,XX @@ void tlb_flush_page_by_mmuidx_all_cpus_synced(CPUState *src_cpu,
152
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, target_ulong pc_start)
193
target_ulong addr,
153
/* Initialize goto_tb jump offsets. */
194
uint16_t idxmap)
154
tb->jmp_reset_offset[0] = TB_JMP_OFFSET_INVALID;
195
{
155
tb->jmp_reset_offset[1] = TB_JMP_OFFSET_INVALID;
196
- const run_on_cpu_func fn = tlb_flush_page_by_mmuidx_async_work;
156
- tcg_ctx->tb_jmp_reset_offset = tb->jmp_reset_offset;
197
- target_ulong addr_and_mmu_idx;
157
- if (TCG_TARGET_HAS_direct_jump) {
198
-
158
- tcg_ctx->tb_jmp_insn_offset = tb->jmp_target_arg;
199
tlb_debug("addr: "TARGET_FMT_lx" mmu_idx:%"PRIx16"\n", addr, idxmap);
159
- tcg_ctx->tb_jmp_target_addr = NULL;
200
160
- } else {
201
/* This should already be page aligned */
161
- tcg_ctx->tb_jmp_insn_offset = NULL;
202
- addr_and_mmu_idx = addr & TARGET_PAGE_MASK;
162
- tcg_ctx->tb_jmp_target_addr = tb->jmp_target_arg;
203
- addr_and_mmu_idx |= idxmap;
163
- }
204
+ addr &= TARGET_PAGE_MASK;
164
205
165
tcg_reg_alloc_start(s);
206
- flush_all_helper(src_cpu, fn, RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx));
166
207
- async_safe_run_on_cpu(src_cpu, fn, RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx));
208
+ /*
209
+ * Allocate memory to hold addr+idxmap only when needed.
210
+ * See tlb_flush_page_by_mmuidx for details.
211
+ */
212
+ if (idxmap < TARGET_PAGE_SIZE) {
213
+ flush_all_helper(src_cpu, tlb_flush_page_by_mmuidx_async_1,
214
+ RUN_ON_CPU_TARGET_PTR(addr | idxmap));
215
+ async_safe_run_on_cpu(src_cpu, tlb_flush_page_by_mmuidx_async_1,
216
+ RUN_ON_CPU_TARGET_PTR(addr | idxmap));
217
+ } else {
218
+ CPUState *dst_cpu;
219
+ TLBFlushPageByMMUIdxData *d;
220
+
221
+ /* Allocate a separate data block for each destination cpu. */
222
+ CPU_FOREACH(dst_cpu) {
223
+ if (dst_cpu != src_cpu) {
224
+ d = g_new(TLBFlushPageByMMUIdxData, 1);
225
+ d->addr = addr;
226
+ d->idxmap = idxmap;
227
+ async_run_on_cpu(dst_cpu, tlb_flush_page_by_mmuidx_async_2,
228
+ RUN_ON_CPU_HOST_PTR(d));
229
+ }
230
+ }
231
+
232
+ d = g_new(TLBFlushPageByMMUIdxData, 1);
233
+ d->addr = addr;
234
+ d->idxmap = idxmap;
235
+ async_safe_run_on_cpu(src_cpu, tlb_flush_page_by_mmuidx_async_2,
236
+ RUN_ON_CPU_HOST_PTR(d));
237
+ }
238
}
239
240
void tlb_flush_page_all_cpus_synced(CPUState *src, target_ulong addr)
241
--
167
--
242
2.20.1
168
2.34.1
243
169
244
170
diff view generated by jsdifflib
1
There is only one caller for tlb_table_flush_by_mmuidx. Place
1
Stop overloading jmp_target_arg for both offset and address,
2
the result at the earlier line number, due to an expected user
2
depending on TCG_TARGET_HAS_direct_jump. Instead, add a new
3
in the near future.
3
field to hold the jump insn offset and always set the target
4
address in jmp_target_addr[]. This will allow a tcg backend
5
to use either direct or indirect depending on displacement.
4
6
5
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
7
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
6
Reviewed-by: Alistair Francis <alistair.francis@wdc.com>
7
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
8
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
8
---
9
---
9
accel/tcg/cputlb.c | 19 +++++++------------
10
include/exec/exec-all.h | 3 ++-
10
1 file changed, 7 insertions(+), 12 deletions(-)
11
accel/tcg/cpu-exec.c | 5 ++---
12
tcg/tcg.c | 6 ++++--
13
3 files changed, 8 insertions(+), 6 deletions(-)
11
14
12
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
15
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
13
index XXXXXXX..XXXXXXX 100644
16
index XXXXXXX..XXXXXXX 100644
14
--- a/accel/tcg/cputlb.c
17
--- a/include/exec/exec-all.h
15
+++ b/accel/tcg/cputlb.c
18
+++ b/include/exec/exec-all.h
16
@@ -XXX,XX +XXX,XX @@ static void tlb_mmu_resize_locked(CPUArchState *env, int mmu_idx)
19
@@ -XXX,XX +XXX,XX @@ struct TranslationBlock {
20
*/
21
#define TB_JMP_OFFSET_INVALID 0xffff /* indicates no jump generated */
22
uint16_t jmp_reset_offset[2]; /* offset of original jump target */
23
- uintptr_t jmp_target_arg[2]; /* target address or offset */
24
+ uint16_t jmp_insn_offset[2]; /* offset of direct jump insn */
25
+ uintptr_t jmp_target_addr[2]; /* target address */
26
27
/*
28
* Each TB has a NULL-terminated list (jmp_list_head) of incoming jumps.
29
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
30
index XXXXXXX..XXXXXXX 100644
31
--- a/accel/tcg/cpu-exec.c
32
+++ b/accel/tcg/cpu-exec.c
33
@@ -XXX,XX +XXX,XX @@ void cpu_exec_step_atomic(CPUState *cpu)
34
35
void tb_set_jmp_target(TranslationBlock *tb, int n, uintptr_t addr)
36
{
37
+ tb->jmp_target_addr[n] = addr;
38
if (TCG_TARGET_HAS_direct_jump) {
39
- uintptr_t offset = tb->jmp_target_arg[n];
40
+ uintptr_t offset = tb->jmp_insn_offset[n];
41
uintptr_t tc_ptr = (uintptr_t)tb->tc.ptr;
42
uintptr_t jmp_rx = tc_ptr + offset;
43
uintptr_t jmp_rw = jmp_rx - tcg_splitwx_diff;
44
tb_target_set_jmp_target(tc_ptr, jmp_rx, jmp_rw, addr);
45
- } else {
46
- tb->jmp_target_arg[n] = addr;
17
}
47
}
18
}
48
}
19
49
20
-static inline void tlb_table_flush_by_mmuidx(CPUArchState *env, int mmu_idx)
50
diff --git a/tcg/tcg.c b/tcg/tcg.c
21
+static void tlb_flush_one_mmuidx_locked(CPUArchState *env, int mmu_idx)
51
index XXXXXXX..XXXXXXX 100644
22
{
52
--- a/tcg/tcg.c
23
tlb_mmu_resize_locked(env, mmu_idx);
53
+++ b/tcg/tcg.c
24
- memset(env_tlb(env)->f[mmu_idx].table, -1, sizeof_tlb(env, mmu_idx));
54
@@ -XXX,XX +XXX,XX @@ static void G_GNUC_UNUSED set_jmp_insn_offset(TCGContext *s, int which)
25
env_tlb(env)->d[mmu_idx].n_used_entries = 0;
55
* tcg_gen_code, where we bound tcg_current_code_size to UINT16_MAX.
26
+ env_tlb(env)->d[mmu_idx].large_page_addr = -1;
56
*/
27
+ env_tlb(env)->d[mmu_idx].large_page_mask = -1;
57
tcg_debug_assert(TCG_TARGET_HAS_direct_jump);
28
+ env_tlb(env)->d[mmu_idx].vindex = 0;
58
- s->gen_tb->jmp_target_arg[which] = tcg_current_code_size(s);
29
+ memset(env_tlb(env)->f[mmu_idx].table, -1, sizeof_tlb(env, mmu_idx));
59
+ s->gen_tb->jmp_insn_offset[which] = tcg_current_code_size(s);
30
+ memset(env_tlb(env)->d[mmu_idx].vtable, -1,
31
+ sizeof(env_tlb(env)->d[0].vtable));
32
}
60
}
33
61
34
static inline void tlb_n_used_entries_inc(CPUArchState *env, uintptr_t mmu_idx)
62
static uintptr_t G_GNUC_UNUSED get_jmp_target_addr(TCGContext *s, int which)
35
@@ -XXX,XX +XXX,XX @@ void tlb_flush_counts(size_t *pfull, size_t *ppart, size_t *pelide)
63
@@ -XXX,XX +XXX,XX @@ static uintptr_t G_GNUC_UNUSED get_jmp_target_addr(TCGContext *s, int which)
36
*pelide = elide;
64
* Return the read-execute version of the pointer, for the benefit
65
* of any pc-relative addressing mode.
66
*/
67
- return (uintptr_t)tcg_splitwx_to_rx(s->gen_tb->jmp_target_arg + which);
68
+ return (uintptr_t)tcg_splitwx_to_rx(&s->gen_tb->jmp_target_addr[which]);
37
}
69
}
38
70
39
-static void tlb_flush_one_mmuidx_locked(CPUArchState *env, int mmu_idx)
71
/* Signal overflow, starting over with fewer guest insns. */
40
-{
72
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, target_ulong pc_start)
41
- tlb_table_flush_by_mmuidx(env, mmu_idx);
73
/* Initialize goto_tb jump offsets. */
42
- env_tlb(env)->d[mmu_idx].large_page_addr = -1;
74
tb->jmp_reset_offset[0] = TB_JMP_OFFSET_INVALID;
43
- env_tlb(env)->d[mmu_idx].large_page_mask = -1;
75
tb->jmp_reset_offset[1] = TB_JMP_OFFSET_INVALID;
44
- env_tlb(env)->d[mmu_idx].vindex = 0;
76
+ tb->jmp_insn_offset[0] = TB_JMP_OFFSET_INVALID;
45
- memset(env_tlb(env)->d[mmu_idx].vtable, -1,
77
+ tb->jmp_insn_offset[1] = TB_JMP_OFFSET_INVALID;
46
- sizeof(env_tlb(env)->d[0].vtable));
78
47
-}
79
tcg_reg_alloc_start(s);
48
-
80
49
static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, run_on_cpu_data data)
50
{
51
CPUArchState *env = cpu->env_ptr;
52
--
81
--
53
2.20.1
82
2.34.1
54
83
55
84
diff view generated by jsdifflib
1
We will want to be able to flush a tlb without resizing.
1
Replace 'tc_ptr' and 'addr' with 'tb' and 'n'.
2
2
3
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
3
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
4
Reviewed-by: Alistair Francis <alistair.francis@wdc.com>
5
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
4
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
---
5
---
8
accel/tcg/cputlb.c | 15 ++++++++++-----
6
tcg/aarch64/tcg-target.h | 3 ++-
9
1 file changed, 10 insertions(+), 5 deletions(-)
7
tcg/arm/tcg-target.h | 3 ++-
8
tcg/i386/tcg-target.h | 9 ++-------
9
tcg/loongarch64/tcg-target.h | 3 ++-
10
tcg/mips/tcg-target.h | 3 ++-
11
tcg/ppc/tcg-target.h | 3 ++-
12
tcg/riscv/tcg-target.h | 3 ++-
13
tcg/s390x/tcg-target.h | 10 ++--------
14
tcg/sparc64/tcg-target.h | 3 ++-
15
tcg/tci/tcg-target.h | 3 ++-
16
accel/tcg/cpu-exec.c | 11 ++++++++---
17
tcg/aarch64/tcg-target.c.inc | 5 +++--
18
tcg/i386/tcg-target.c.inc | 9 +++++++++
19
tcg/loongarch64/tcg-target.c.inc | 5 +++--
20
tcg/ppc/tcg-target.c.inc | 7 ++++---
21
tcg/s390x/tcg-target.c.inc | 10 ++++++++++
22
tcg/sparc64/tcg-target.c.inc | 7 ++++---
23
17 files changed, 61 insertions(+), 36 deletions(-)
10
24
11
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
25
diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
12
index XXXXXXX..XXXXXXX 100644
26
index XXXXXXX..XXXXXXX 100644
13
--- a/accel/tcg/cputlb.c
27
--- a/tcg/aarch64/tcg-target.h
14
+++ b/accel/tcg/cputlb.c
28
+++ b/tcg/aarch64/tcg-target.h
15
@@ -XXX,XX +XXX,XX @@ static void tlb_mmu_resize_locked(CPUTLBDesc *desc, CPUTLBDescFast *fast)
29
@@ -XXX,XX +XXX,XX @@ typedef enum {
30
#define TCG_TARGET_DEFAULT_MO (0)
31
#define TCG_TARGET_HAS_MEMORY_BSWAP 0
32
33
-void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t);
34
+void tb_target_set_jmp_target(const TranslationBlock *, int,
35
+ uintptr_t, uintptr_t);
36
37
#define TCG_TARGET_NEED_LDST_LABELS
38
#define TCG_TARGET_NEED_POOL_LABELS
39
diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
40
index XXXXXXX..XXXXXXX 100644
41
--- a/tcg/arm/tcg-target.h
42
+++ b/tcg/arm/tcg-target.h
43
@@ -XXX,XX +XXX,XX @@ extern bool use_neon_instructions;
44
#define TCG_TARGET_HAS_MEMORY_BSWAP 0
45
46
/* not defined -- call should be eliminated at compile time */
47
-void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t);
48
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
49
+ uintptr_t, uintptr_t);
50
51
#define TCG_TARGET_NEED_LDST_LABELS
52
#define TCG_TARGET_NEED_POOL_LABELS
53
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
54
index XXXXXXX..XXXXXXX 100644
55
--- a/tcg/i386/tcg-target.h
56
+++ b/tcg/i386/tcg-target.h
57
@@ -XXX,XX +XXX,XX @@ extern bool have_movbe;
58
#define TCG_TARGET_extract_i64_valid(ofs, len) \
59
(((ofs) == 8 && (len) == 8) || ((ofs) + (len)) == 32)
60
61
-static inline void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
62
- uintptr_t jmp_rw, uintptr_t addr)
63
-{
64
- /* patch the branch destination */
65
- qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4));
66
- /* no need to flush icache explicitly */
67
-}
68
+void tb_target_set_jmp_target(const TranslationBlock *, int,
69
+ uintptr_t, uintptr_t);
70
71
/* This defines the natural memory order supported by this
72
* architecture before guarantees made by various barrier
73
diff --git a/tcg/loongarch64/tcg-target.h b/tcg/loongarch64/tcg-target.h
74
index XXXXXXX..XXXXXXX 100644
75
--- a/tcg/loongarch64/tcg-target.h
76
+++ b/tcg/loongarch64/tcg-target.h
77
@@ -XXX,XX +XXX,XX @@ typedef enum {
78
#define TCG_TARGET_HAS_muluh_i64 1
79
#define TCG_TARGET_HAS_mulsh_i64 1
80
81
-void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t);
82
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
83
+ uintptr_t, uintptr_t);
84
85
#define TCG_TARGET_DEFAULT_MO (0)
86
87
diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h
88
index XXXXXXX..XXXXXXX 100644
89
--- a/tcg/mips/tcg-target.h
90
+++ b/tcg/mips/tcg-target.h
91
@@ -XXX,XX +XXX,XX @@ extern bool use_mips32r2_instructions;
92
#define TCG_TARGET_HAS_MEMORY_BSWAP 1
93
94
/* not defined -- call should be eliminated at compile time */
95
-void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t)
96
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
97
+ uintptr_t, uintptr_t)
98
QEMU_ERROR("code path is reachable");
99
100
#define TCG_TARGET_NEED_LDST_LABELS
101
diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
102
index XXXXXXX..XXXXXXX 100644
103
--- a/tcg/ppc/tcg-target.h
104
+++ b/tcg/ppc/tcg-target.h
105
@@ -XXX,XX +XXX,XX @@ extern bool have_vsx;
106
#define TCG_TARGET_HAS_bitsel_vec have_vsx
107
#define TCG_TARGET_HAS_cmpsel_vec 0
108
109
-void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t);
110
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
111
+ uintptr_t, uintptr_t);
112
113
#define TCG_TARGET_DEFAULT_MO (0)
114
#define TCG_TARGET_HAS_MEMORY_BSWAP 1
115
diff --git a/tcg/riscv/tcg-target.h b/tcg/riscv/tcg-target.h
116
index XXXXXXX..XXXXXXX 100644
117
--- a/tcg/riscv/tcg-target.h
118
+++ b/tcg/riscv/tcg-target.h
119
@@ -XXX,XX +XXX,XX @@ typedef enum {
120
#endif
121
122
/* not defined -- call should be eliminated at compile time */
123
-void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t);
124
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
125
+ uintptr_t, uintptr_t);
126
127
#define TCG_TARGET_DEFAULT_MO (0)
128
129
diff --git a/tcg/s390x/tcg-target.h b/tcg/s390x/tcg-target.h
130
index XXXXXXX..XXXXXXX 100644
131
--- a/tcg/s390x/tcg-target.h
132
+++ b/tcg/s390x/tcg-target.h
133
@@ -XXX,XX +XXX,XX @@ extern uint64_t s390_facilities[3];
134
135
#define TCG_TARGET_DEFAULT_MO (TCG_MO_ALL & ~TCG_MO_ST_LD)
136
137
-static inline void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
138
- uintptr_t jmp_rw, uintptr_t addr)
139
-{
140
- /* patch the branch destination */
141
- intptr_t disp = addr - (jmp_rx - 2);
142
- qatomic_set((int32_t *)jmp_rw, disp / 2);
143
- /* no need to flush icache explicitly */
144
-}
145
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
146
+ uintptr_t jmp_rx, uintptr_t jmp_rw);
147
148
#define TCG_TARGET_NEED_LDST_LABELS
149
#define TCG_TARGET_NEED_POOL_LABELS
150
diff --git a/tcg/sparc64/tcg-target.h b/tcg/sparc64/tcg-target.h
151
index XXXXXXX..XXXXXXX 100644
152
--- a/tcg/sparc64/tcg-target.h
153
+++ b/tcg/sparc64/tcg-target.h
154
@@ -XXX,XX +XXX,XX @@ extern bool use_vis3_instructions;
155
#define TCG_TARGET_DEFAULT_MO (0)
156
#define TCG_TARGET_HAS_MEMORY_BSWAP 1
157
158
-void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t);
159
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
160
+ uintptr_t, uintptr_t);
161
162
#define TCG_TARGET_NEED_POOL_LABELS
163
164
diff --git a/tcg/tci/tcg-target.h b/tcg/tci/tcg-target.h
165
index XXXXXXX..XXXXXXX 100644
166
--- a/tcg/tci/tcg-target.h
167
+++ b/tcg/tci/tcg-target.h
168
@@ -XXX,XX +XXX,XX @@ typedef enum {
169
#define TCG_TARGET_HAS_MEMORY_BSWAP 1
170
171
/* not defined -- call should be eliminated at compile time */
172
-void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t);
173
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
174
+ uintptr_t, uintptr_t);
175
176
#endif /* TCG_TARGET_H */
177
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
178
index XXXXXXX..XXXXXXX 100644
179
--- a/accel/tcg/cpu-exec.c
180
+++ b/accel/tcg/cpu-exec.c
181
@@ -XXX,XX +XXX,XX @@ void tb_set_jmp_target(TranslationBlock *tb, int n, uintptr_t addr)
182
{
183
tb->jmp_target_addr[n] = addr;
184
if (TCG_TARGET_HAS_direct_jump) {
185
+ /*
186
+ * Get the rx view of the structure, from which we find the
187
+ * executable code address, and tb_target_set_jmp_target can
188
+ * produce a pc-relative displacement to jmp_target_addr[n].
189
+ */
190
+ const TranslationBlock *c_tb = tcg_splitwx_to_rx(tb);
191
uintptr_t offset = tb->jmp_insn_offset[n];
192
- uintptr_t tc_ptr = (uintptr_t)tb->tc.ptr;
193
- uintptr_t jmp_rx = tc_ptr + offset;
194
+ uintptr_t jmp_rx = (uintptr_t)tb->tc.ptr + offset;
195
uintptr_t jmp_rw = jmp_rx - tcg_splitwx_diff;
196
- tb_target_set_jmp_target(tc_ptr, jmp_rx, jmp_rw, addr);
197
+ tb_target_set_jmp_target(c_tb, n, jmp_rx, jmp_rw);
16
}
198
}
17
}
199
}
18
200
19
-static void tlb_flush_one_mmuidx_locked(CPUArchState *env, int mmu_idx)
201
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
20
+static void tlb_mmu_flush_locked(CPUTLBDesc *desc, CPUTLBDescFast *fast)
202
index XXXXXXX..XXXXXXX 100644
21
{
203
--- a/tcg/aarch64/tcg-target.c.inc
22
- CPUTLBDesc *desc = &env_tlb(env)->d[mmu_idx];
204
+++ b/tcg/aarch64/tcg-target.c.inc
23
- CPUTLBDescFast *fast = &env_tlb(env)->f[mmu_idx];
205
@@ -XXX,XX +XXX,XX @@ static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target,
24
-
206
tcg_out_call_int(s, target);
25
- tlb_mmu_resize_locked(desc, fast);
207
}
26
desc->n_used_entries = 0;
208
27
desc->large_page_addr = -1;
209
-void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
28
desc->large_page_mask = -1;
210
- uintptr_t jmp_rw, uintptr_t addr)
29
@@ -XXX,XX +XXX,XX @@ static void tlb_flush_one_mmuidx_locked(CPUArchState *env, int mmu_idx)
211
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
30
memset(desc->vtable, -1, sizeof(desc->vtable));
212
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
31
}
213
{
32
214
+ uintptr_t addr = tb->jmp_target_addr[n];
33
+static void tlb_flush_one_mmuidx_locked(CPUArchState *env, int mmu_idx)
215
tcg_insn_unit i1, i2;
216
TCGType rt = TCG_TYPE_I64;
217
TCGReg rd = TCG_REG_TMP;
218
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
219
index XXXXXXX..XXXXXXX 100644
220
--- a/tcg/i386/tcg-target.c.inc
221
+++ b/tcg/i386/tcg-target.c.inc
222
@@ -XXX,XX +XXX,XX @@ static void tcg_out_goto_tb(TCGContext *s, int which)
223
set_jmp_reset_offset(s, which);
224
}
225
226
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
227
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
34
+{
228
+{
35
+ CPUTLBDesc *desc = &env_tlb(env)->d[mmu_idx];
229
+ /* patch the branch destination */
36
+ CPUTLBDescFast *fast = &env_tlb(env)->f[mmu_idx];
230
+ uintptr_t addr = tb->jmp_target_addr[n];
37
+
231
+ qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4));
38
+ tlb_mmu_resize_locked(desc, fast);
232
+ /* no need to flush icache explicitly */
39
+ tlb_mmu_flush_locked(desc, fast);
40
+}
233
+}
41
+
234
+
42
static inline void tlb_n_used_entries_inc(CPUArchState *env, uintptr_t mmu_idx)
235
static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
43
{
236
const TCGArg args[TCG_MAX_OP_ARGS],
44
env_tlb(env)->d[mmu_idx].n_used_entries++;
237
const int const_args[TCG_MAX_OP_ARGS])
238
diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
239
index XXXXXXX..XXXXXXX 100644
240
--- a/tcg/loongarch64/tcg-target.c.inc
241
+++ b/tcg/loongarch64/tcg-target.c.inc
242
@@ -XXX,XX +XXX,XX @@ static void tcg_out_nop(TCGContext *s)
243
tcg_out32(s, NOP);
244
}
245
246
-void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
247
- uintptr_t jmp_rw, uintptr_t addr)
248
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
249
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
250
{
251
tcg_insn_unit i1, i2;
252
ptrdiff_t upper, lower;
253
+ uintptr_t addr = tb->jmp_target_addr[n];
254
ptrdiff_t offset = (ptrdiff_t)(addr - jmp_rx) >> 2;
255
256
if (offset == sextreg(offset, 0, 26)) {
257
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
258
index XXXXXXX..XXXXXXX 100644
259
--- a/tcg/ppc/tcg-target.c.inc
260
+++ b/tcg/ppc/tcg-target.c.inc
261
@@ -XXX,XX +XXX,XX @@ static inline void ppc64_replace4(uintptr_t rx, uintptr_t rw,
262
flush_idcache_range(rx, rw, 16);
263
}
264
265
-void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
266
- uintptr_t jmp_rw, uintptr_t addr)
267
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
268
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
269
{
270
tcg_insn_unit i0, i1, i2, i3;
271
- intptr_t tb_diff = addr - tc_ptr;
272
+ uintptr_t addr = tb->jmp_target_addr[n];
273
+ intptr_t tb_diff = addr - (uintptr_t)tb->tc.ptr;
274
intptr_t br_diff = addr - (jmp_rx + 4);
275
intptr_t lo, hi;
276
277
diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
278
index XXXXXXX..XXXXXXX 100644
279
--- a/tcg/s390x/tcg-target.c.inc
280
+++ b/tcg/s390x/tcg-target.c.inc
281
@@ -XXX,XX +XXX,XX @@ static void tcg_out_goto_tb(TCGContext *s, int which)
282
set_jmp_reset_offset(s, which);
283
}
284
285
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
286
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
287
+{
288
+ /* patch the branch destination */
289
+ uintptr_t addr = tb->jmp_target_addr[n];
290
+ intptr_t disp = addr - (jmp_rx - 2);
291
+ qatomic_set((int32_t *)jmp_rw, disp / 2);
292
+ /* no need to flush icache explicitly */
293
+}
294
+
295
# define OP_32_64(x) \
296
case glue(glue(INDEX_op_,x),_i32): \
297
case glue(glue(INDEX_op_,x),_i64)
298
diff --git a/tcg/sparc64/tcg-target.c.inc b/tcg/sparc64/tcg-target.c.inc
299
index XXXXXXX..XXXXXXX 100644
300
--- a/tcg/sparc64/tcg-target.c.inc
301
+++ b/tcg/sparc64/tcg-target.c.inc
302
@@ -XXX,XX +XXX,XX @@ void tcg_register_jit(const void *buf, size_t buf_size)
303
tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
304
}
305
306
-void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
307
- uintptr_t jmp_rw, uintptr_t addr)
308
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
309
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
310
{
311
- intptr_t tb_disp = addr - tc_ptr;
312
+ uintptr_t addr = tb->jmp_target_addr[n];
313
+ intptr_t tb_disp = addr - (uintptr_t)tb->tc.ptr;
314
intptr_t br_disp = addr - jmp_rx;
315
tcg_insn_unit i1, i2;
316
45
--
317
--
46
2.20.1
318
2.34.1
47
319
48
320
diff view generated by jsdifflib
1
From: Carlos Santos <casantos@redhat.com>
1
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
2
2
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
3
uClibc defines _SC_LEVEL1_ICACHE_LINESIZE and _SC_LEVEL1_DCACHE_LINESIZE
4
but the corresponding sysconf calls return -1, which is a valid result,
5
meaning that the limit is indeterminate.
6
7
Handle this situation using the fallback values instead of crashing due
8
to an assertion failure.
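For illustration only, a minimal standalone sketch of the pattern described above: treat a non-positive sysconf() result as "indeterminate" and keep the built-in fallback instead of asserting. The 64-byte fallback value is an assumption for the example.

    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        int dsize = 64;  /* assumed fallback cache line size */
    #ifdef _SC_LEVEL1_DCACHE_LINESIZE
        long v = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
        if (v > 0) {
            dsize = (int)v;  /* only trust a positive answer */
        }
    #endif
        printf("dcache line: %d bytes\n", dsize);
        return 0;
    }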
9
10
Signed-off-by: Carlos Santos <casantos@redhat.com>
11
Message-Id: <20191017123713.30192-1-casantos@redhat.com>
12
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
13
---
4
---
14
util/cacheinfo.c | 10 ++++++++--
5
include/tcg/tcg.h | 3 +++
15
1 file changed, 8 insertions(+), 2 deletions(-)
6
tcg/aarch64/tcg-target.h | 4 ----
7
tcg/arm/tcg-target.h | 5 -----
8
tcg/i386/tcg-target.h | 3 ---
9
tcg/loongarch64/tcg-target.h | 3 ---
10
tcg/mips/tcg-target.h | 5 -----
11
tcg/ppc/tcg-target.h | 4 ----
12
tcg/riscv/tcg-target.h | 4 ----
13
tcg/s390x/tcg-target.h | 4 ----
14
tcg/sparc64/tcg-target.h | 4 ----
15
tcg/tci/tcg-target.h | 4 ----
16
11 files changed, 3 insertions(+), 40 deletions(-)
16
17
17
diff --git a/util/cacheinfo.c b/util/cacheinfo.c
18
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
18
index XXXXXXX..XXXXXXX 100644
19
index XXXXXXX..XXXXXXX 100644
19
--- a/util/cacheinfo.c
20
--- a/include/tcg/tcg.h
20
+++ b/util/cacheinfo.c
21
+++ b/include/tcg/tcg.h
21
@@ -XXX,XX +XXX,XX @@ static void sys_cache_info(int *isize, int *dsize)
22
@@ -XXX,XX +XXX,XX @@ void tcg_func_start(TCGContext *s);
22
static void sys_cache_info(int *isize, int *dsize)
23
23
{
24
int tcg_gen_code(TCGContext *s, TranslationBlock *tb, target_ulong pc_start);
24
# ifdef _SC_LEVEL1_ICACHE_LINESIZE
25
25
- *isize = sysconf(_SC_LEVEL1_ICACHE_LINESIZE);
26
+void tb_target_set_jmp_target(const TranslationBlock *, int,
26
+ int tmp_isize = (int) sysconf(_SC_LEVEL1_ICACHE_LINESIZE);
27
+ uintptr_t, uintptr_t);
27
+ if (tmp_isize > 0) {
28
+
28
+ *isize = tmp_isize;
29
void tcg_set_frame(TCGContext *s, TCGReg reg, intptr_t start, intptr_t size);
29
+ }
30
30
# endif
31
TCGTemp *tcg_global_mem_new_internal(TCGType, TCGv_ptr,
31
# ifdef _SC_LEVEL1_DCACHE_LINESIZE
32
diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
32
- *dsize = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
33
index XXXXXXX..XXXXXXX 100644
33
+ int tmp_dsize = (int) sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
34
--- a/tcg/aarch64/tcg-target.h
34
+ if (tmp_dsize > 0) {
35
+++ b/tcg/aarch64/tcg-target.h
35
+ *dsize = tmp_dsize;
36
@@ -XXX,XX +XXX,XX @@ typedef enum {
36
+ }
37
37
# endif
38
#define TCG_TARGET_DEFAULT_MO (0)
38
}
39
#define TCG_TARGET_HAS_MEMORY_BSWAP 0
39
#endif /* sys_cache_info */
40
-
41
-void tb_target_set_jmp_target(const TranslationBlock *, int,
42
- uintptr_t, uintptr_t);
43
-
44
#define TCG_TARGET_NEED_LDST_LABELS
45
#define TCG_TARGET_NEED_POOL_LABELS
46
47
diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
48
index XXXXXXX..XXXXXXX 100644
49
--- a/tcg/arm/tcg-target.h
50
+++ b/tcg/arm/tcg-target.h
51
@@ -XXX,XX +XXX,XX @@ extern bool use_neon_instructions;
52
53
#define TCG_TARGET_DEFAULT_MO (0)
54
#define TCG_TARGET_HAS_MEMORY_BSWAP 0
55
-
56
-/* not defined -- call should be eliminated at compile time */
57
-void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
58
- uintptr_t, uintptr_t);
59
-
60
#define TCG_TARGET_NEED_LDST_LABELS
61
#define TCG_TARGET_NEED_POOL_LABELS
62
63
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
64
index XXXXXXX..XXXXXXX 100644
65
--- a/tcg/i386/tcg-target.h
66
+++ b/tcg/i386/tcg-target.h
67
@@ -XXX,XX +XXX,XX @@ extern bool have_movbe;
68
#define TCG_TARGET_extract_i64_valid(ofs, len) \
69
(((ofs) == 8 && (len) == 8) || ((ofs) + (len)) == 32)
70
71
-void tb_target_set_jmp_target(const TranslationBlock *, int,
72
- uintptr_t, uintptr_t);
73
-
74
/* This defines the natural memory order supported by this
75
* architecture before guarantees made by various barrier
76
* instructions.
77
diff --git a/tcg/loongarch64/tcg-target.h b/tcg/loongarch64/tcg-target.h
78
index XXXXXXX..XXXXXXX 100644
79
--- a/tcg/loongarch64/tcg-target.h
80
+++ b/tcg/loongarch64/tcg-target.h
81
@@ -XXX,XX +XXX,XX @@ typedef enum {
82
#define TCG_TARGET_HAS_muluh_i64 1
83
#define TCG_TARGET_HAS_mulsh_i64 1
84
85
-void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
86
- uintptr_t, uintptr_t);
87
-
88
#define TCG_TARGET_DEFAULT_MO (0)
89
90
#define TCG_TARGET_NEED_LDST_LABELS
91
diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h
92
index XXXXXXX..XXXXXXX 100644
93
--- a/tcg/mips/tcg-target.h
94
+++ b/tcg/mips/tcg-target.h
95
@@ -XXX,XX +XXX,XX @@ extern bool use_mips32r2_instructions;
96
#define TCG_TARGET_DEFAULT_MO (0)
97
#define TCG_TARGET_HAS_MEMORY_BSWAP 1
98
99
-/* not defined -- call should be eliminated at compile time */
100
-void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
101
- uintptr_t, uintptr_t)
102
- QEMU_ERROR("code path is reachable");
103
-
104
#define TCG_TARGET_NEED_LDST_LABELS
105
106
#endif
107
diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
108
index XXXXXXX..XXXXXXX 100644
109
--- a/tcg/ppc/tcg-target.h
110
+++ b/tcg/ppc/tcg-target.h
111
@@ -XXX,XX +XXX,XX @@ extern bool have_vsx;
112
#define TCG_TARGET_HAS_bitsel_vec have_vsx
113
#define TCG_TARGET_HAS_cmpsel_vec 0
114
115
-void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
116
- uintptr_t, uintptr_t);
117
-
118
#define TCG_TARGET_DEFAULT_MO (0)
119
#define TCG_TARGET_HAS_MEMORY_BSWAP 1
120
-
121
#define TCG_TARGET_NEED_LDST_LABELS
122
#define TCG_TARGET_NEED_POOL_LABELS
123
124
diff --git a/tcg/riscv/tcg-target.h b/tcg/riscv/tcg-target.h
125
index XXXXXXX..XXXXXXX 100644
126
--- a/tcg/riscv/tcg-target.h
127
+++ b/tcg/riscv/tcg-target.h
128
@@ -XXX,XX +XXX,XX @@ typedef enum {
129
#define TCG_TARGET_HAS_mulsh_i64 1
130
#endif
131
132
-/* not defined -- call should be eliminated at compile time */
133
-void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
134
- uintptr_t, uintptr_t);
135
-
136
#define TCG_TARGET_DEFAULT_MO (0)
137
138
#define TCG_TARGET_NEED_LDST_LABELS
139
diff --git a/tcg/s390x/tcg-target.h b/tcg/s390x/tcg-target.h
140
index XXXXXXX..XXXXXXX 100644
141
--- a/tcg/s390x/tcg-target.h
142
+++ b/tcg/s390x/tcg-target.h
143
@@ -XXX,XX +XXX,XX @@ extern uint64_t s390_facilities[3];
144
#define TCG_TARGET_HAS_MEMORY_BSWAP 1
145
146
#define TCG_TARGET_DEFAULT_MO (TCG_MO_ALL & ~TCG_MO_ST_LD)
147
-
148
-void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
149
- uintptr_t jmp_rx, uintptr_t jmp_rw);
150
-
151
#define TCG_TARGET_NEED_LDST_LABELS
152
#define TCG_TARGET_NEED_POOL_LABELS
153
154
diff --git a/tcg/sparc64/tcg-target.h b/tcg/sparc64/tcg-target.h
155
index XXXXXXX..XXXXXXX 100644
156
--- a/tcg/sparc64/tcg-target.h
157
+++ b/tcg/sparc64/tcg-target.h
158
@@ -XXX,XX +XXX,XX @@ extern bool use_vis3_instructions;
159
160
#define TCG_TARGET_DEFAULT_MO (0)
161
#define TCG_TARGET_HAS_MEMORY_BSWAP 1
162
-
163
-void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
164
- uintptr_t, uintptr_t);
165
-
166
#define TCG_TARGET_NEED_POOL_LABELS
167
168
#endif
169
diff --git a/tcg/tci/tcg-target.h b/tcg/tci/tcg-target.h
170
index XXXXXXX..XXXXXXX 100644
171
--- a/tcg/tci/tcg-target.h
172
+++ b/tcg/tci/tcg-target.h
173
@@ -XXX,XX +XXX,XX @@ typedef enum {
174
175
#define TCG_TARGET_HAS_MEMORY_BSWAP 1
176
177
-/* not defined -- call should be eliminated at compile time */
178
-void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
179
- uintptr_t, uintptr_t);
180
-
181
#endif /* TCG_TARGET_H */
40
--
182
--
41
2.20.1
183
2.34.1
42
184
43
185
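As an illustration of the fallback pattern in the cacheinfo fix above: treat any
non-positive sysconf() result as indeterminate and keep the caller's default.
This is a standalone sketch with an invented helper name, not the QEMU code itself.

    #include <unistd.h>

    /*
     * Return the L1 cache line size reported by sysconf(), or "fallback"
     * when the limit is indeterminate.  sysconf() returns -1 both on error
     * and when the limit cannot be determined (as uClibc does here), so
     * only a positive result is trusted.
     */
    static int cache_line_size(int name, int fallback)
    {
        long v = sysconf(name);
        return v > 0 ? (int)v : fallback;
    }

A caller would pass e.g. _SC_LEVEL1_DCACHE_LINESIZE together with its
architecture default, so the assertion that tripped on uClibc never sees a
negative size.
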
1
Do not call get_clock_realtime() in tlb_mmu_resize_locked,
1
Install empty versions for !TCG_TARGET_HAS_direct_jump hosts.
2
but hoist it outside of any loop over a set of tlbs. There are
3
only two (indirect) callers, tlb_flush_by_mmuidx_async_work
4
and tlb_flush_page_locked, so this is not onerous.
5
2
6
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
3
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
7
Reviewed-by: Alistair Francis <alistair.francis@wdc.com>
4
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
8
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
9
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
10
---
6
---
11
accel/tcg/cputlb.c | 14 ++++++++------
7
tcg/arm/tcg-target.c.inc | 6 ++++++
12
1 file changed, 8 insertions(+), 6 deletions(-)
8
tcg/mips/tcg-target.c.inc | 6 ++++++
9
tcg/riscv/tcg-target.c.inc | 6 ++++++
10
tcg/tci/tcg-target.c.inc | 6 ++++++
11
4 files changed, 24 insertions(+)
13
12
14
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
13
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
15
index XXXXXXX..XXXXXXX 100644
14
index XXXXXXX..XXXXXXX 100644
16
--- a/accel/tcg/cputlb.c
15
--- a/tcg/arm/tcg-target.c.inc
17
+++ b/accel/tcg/cputlb.c
16
+++ b/tcg/arm/tcg-target.c.inc
18
@@ -XXX,XX +XXX,XX @@ static void tlb_window_reset(CPUTLBDesc *desc, int64_t ns,
17
@@ -XXX,XX +XXX,XX @@ static void tcg_out_goto_tb(TCGContext *s, int which)
19
* high), since otherwise we are likely to have a significant amount of
18
set_jmp_reset_offset(s, which);
20
* conflict misses.
21
*/
22
-static void tlb_mmu_resize_locked(CPUTLBDesc *desc, CPUTLBDescFast *fast)
23
+static void tlb_mmu_resize_locked(CPUTLBDesc *desc, CPUTLBDescFast *fast,
24
+ int64_t now)
25
{
26
size_t old_size = tlb_n_entries(fast);
27
size_t rate;
28
size_t new_size = old_size;
29
- int64_t now = get_clock_realtime();
30
int64_t window_len_ms = 100;
31
int64_t window_len_ns = window_len_ms * 1000 * 1000;
32
bool window_expired = now > desc->window_begin_ns + window_len_ns;
33
@@ -XXX,XX +XXX,XX @@ static void tlb_mmu_flush_locked(CPUTLBDesc *desc, CPUTLBDescFast *fast)
34
memset(desc->vtable, -1, sizeof(desc->vtable));
35
}
19
}
36
20
37
-static void tlb_flush_one_mmuidx_locked(CPUArchState *env, int mmu_idx)
21
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
38
+static void tlb_flush_one_mmuidx_locked(CPUArchState *env, int mmu_idx,
22
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
39
+ int64_t now)
23
+{
40
{
24
+ /* Always indirect, nothing to do */
41
CPUTLBDesc *desc = &env_tlb(env)->d[mmu_idx];
25
+}
42
CPUTLBDescFast *fast = &env_tlb(env)->f[mmu_idx];
26
+
43
27
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
44
- tlb_mmu_resize_locked(desc, fast);
28
const TCGArg args[TCG_MAX_OP_ARGS],
45
+ tlb_mmu_resize_locked(desc, fast, now);
29
const int const_args[TCG_MAX_OP_ARGS])
46
tlb_mmu_flush_locked(desc, fast);
30
diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
31
index XXXXXXX..XXXXXXX 100644
32
--- a/tcg/mips/tcg-target.c.inc
33
+++ b/tcg/mips/tcg-target.c.inc
34
@@ -XXX,XX +XXX,XX @@ static void tcg_out_goto_tb(TCGContext *s, int which)
35
set_jmp_reset_offset(s, which);
47
}
36
}
48
37
49
@@ -XXX,XX +XXX,XX @@ static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, run_on_cpu_data data)
38
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
50
CPUArchState *env = cpu->env_ptr;
39
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
51
uint16_t asked = data.host_int;
40
+{
52
uint16_t all_dirty, work, to_clean;
41
+ /* Always indirect, nothing to do */
53
+ int64_t now = get_clock_realtime();
42
+}
54
43
+
55
assert_cpu_is_self(cpu);
44
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
56
45
const TCGArg args[TCG_MAX_OP_ARGS],
57
@@ -XXX,XX +XXX,XX @@ static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, run_on_cpu_data data)
46
const int const_args[TCG_MAX_OP_ARGS])
58
47
diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
59
for (work = to_clean; work != 0; work &= work - 1) {
48
index XXXXXXX..XXXXXXX 100644
60
int mmu_idx = ctz32(work);
49
--- a/tcg/riscv/tcg-target.c.inc
61
- tlb_flush_one_mmuidx_locked(env, mmu_idx);
50
+++ b/tcg/riscv/tcg-target.c.inc
62
+ tlb_flush_one_mmuidx_locked(env, mmu_idx, now);
51
@@ -XXX,XX +XXX,XX @@ static void tcg_out_goto_tb(TCGContext *s, int which)
63
}
52
set_jmp_reset_offset(s, which);
64
53
}
65
qemu_spin_unlock(&env_tlb(env)->c.lock);
54
66
@@ -XXX,XX +XXX,XX @@ static void tlb_flush_page_locked(CPUArchState *env, int midx,
55
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
67
tlb_debug("forcing full flush midx %d ("
56
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
68
TARGET_FMT_lx "/" TARGET_FMT_lx ")\n",
57
+{
69
midx, lp_addr, lp_mask);
58
+ /* Always indirect, nothing to do */
70
- tlb_flush_one_mmuidx_locked(env, midx);
59
+}
71
+ tlb_flush_one_mmuidx_locked(env, midx, get_clock_realtime());
60
+
72
} else {
61
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
73
if (tlb_flush_entry_locked(tlb_entry(env, midx, page), page)) {
62
const TCGArg args[TCG_MAX_OP_ARGS],
74
tlb_n_used_entries_dec(env, midx);
63
const int const_args[TCG_MAX_OP_ARGS])
64
diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
65
index XXXXXXX..XXXXXXX 100644
66
--- a/tcg/tci/tcg-target.c.inc
67
+++ b/tcg/tci/tcg-target.c.inc
68
@@ -XXX,XX +XXX,XX @@ static void tcg_out_goto_tb(TCGContext *s, int which)
69
set_jmp_reset_offset(s, which);
70
}
71
72
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
73
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
74
+{
75
+ /* Always indirect, nothing to do */
76
+}
77
+
78
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
79
const TCGArg args[TCG_MAX_OP_ARGS],
80
const int const_args[TCG_MAX_OP_ARGS])
75
--
81
--
76
2.20.1
82
2.34.1
77
83
78
84
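The cputlb change above amounts to reading the realtime clock once per flush
operation and passing the timestamp down, instead of reading it once per
resized TLB. A minimal sketch of that shape, with illustrative names rather
than the QEMU functions:

    #include <stdint.h>
    #include <time.h>

    static int64_t realtime_ns(void)
    {
        struct timespec ts;
        clock_gettime(CLOCK_REALTIME, &ts);
        return (int64_t)ts.tv_sec * 1000000000LL + ts.tv_nsec;
    }

    static void resize_one(int idx, int64_t now)
    {
        /* per-index bookkeeping keyed off the shared timestamp */
        (void)idx;
        (void)now;
    }

    static void flush_all(int n)
    {
        int64_t now = realtime_ns();    /* read once, outside the loop */
        for (int i = 0; i < n; i++) {
            resize_one(i, now);
        }
    }
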
New patch
1
1
We now have the option to generate direct or indirect
2
goto_tb depending on the dynamic displacement, thus
3
the define is no longer necessary, nor is it completely accurate.
4
5
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
---
8
tcg/aarch64/tcg-target.h | 1 -
9
tcg/arm/tcg-target.h | 1 -
10
tcg/i386/tcg-target.h | 1 -
11
tcg/loongarch64/tcg-target.h | 1 -
12
tcg/mips/tcg-target.h | 1 -
13
tcg/ppc/tcg-target.h | 1 -
14
tcg/riscv/tcg-target.h | 1 -
15
tcg/s390x/tcg-target.h | 1 -
16
tcg/sparc64/tcg-target.h | 1 -
17
tcg/tci/tcg-target.h | 1 -
18
accel/tcg/cpu-exec.c | 23 +++++++++++------------
19
tcg/tcg.c | 1 -
20
tcg/arm/tcg-target.c.inc | 1 -
21
tcg/mips/tcg-target.c.inc | 1 -
22
tcg/riscv/tcg-target.c.inc | 1 -
23
tcg/s390x/tcg-target.c.inc | 3 +++
24
tcg/tci/tcg-target.c.inc | 1 -
25
17 files changed, 14 insertions(+), 27 deletions(-)
26
27
diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
28
index XXXXXXX..XXXXXXX 100644
29
--- a/tcg/aarch64/tcg-target.h
30
+++ b/tcg/aarch64/tcg-target.h
31
@@ -XXX,XX +XXX,XX @@ typedef enum {
32
#define TCG_TARGET_HAS_muls2_i64 0
33
#define TCG_TARGET_HAS_muluh_i64 1
34
#define TCG_TARGET_HAS_mulsh_i64 1
35
-#define TCG_TARGET_HAS_direct_jump 1
36
37
#define TCG_TARGET_HAS_v64 1
38
#define TCG_TARGET_HAS_v128 1
39
diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
40
index XXXXXXX..XXXXXXX 100644
41
--- a/tcg/arm/tcg-target.h
42
+++ b/tcg/arm/tcg-target.h
43
@@ -XXX,XX +XXX,XX @@ extern bool use_neon_instructions;
44
#define TCG_TARGET_HAS_mulsh_i32 0
45
#define TCG_TARGET_HAS_div_i32 use_idiv_instructions
46
#define TCG_TARGET_HAS_rem_i32 0
47
-#define TCG_TARGET_HAS_direct_jump 0
48
#define TCG_TARGET_HAS_qemu_st8_i32 0
49
50
#define TCG_TARGET_HAS_v64 use_neon_instructions
51
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
52
index XXXXXXX..XXXXXXX 100644
53
--- a/tcg/i386/tcg-target.h
54
+++ b/tcg/i386/tcg-target.h
55
@@ -XXX,XX +XXX,XX @@ extern bool have_movbe;
56
#define TCG_TARGET_HAS_muls2_i32 1
57
#define TCG_TARGET_HAS_muluh_i32 0
58
#define TCG_TARGET_HAS_mulsh_i32 0
59
-#define TCG_TARGET_HAS_direct_jump 1
60
61
#if TCG_TARGET_REG_BITS == 64
62
/* Keep target addresses zero-extended in a register. */
63
diff --git a/tcg/loongarch64/tcg-target.h b/tcg/loongarch64/tcg-target.h
64
index XXXXXXX..XXXXXXX 100644
65
--- a/tcg/loongarch64/tcg-target.h
66
+++ b/tcg/loongarch64/tcg-target.h
67
@@ -XXX,XX +XXX,XX @@ typedef enum {
68
#define TCG_TARGET_HAS_clz_i32 1
69
#define TCG_TARGET_HAS_ctz_i32 1
70
#define TCG_TARGET_HAS_ctpop_i32 0
71
-#define TCG_TARGET_HAS_direct_jump 1
72
#define TCG_TARGET_HAS_brcond2 0
73
#define TCG_TARGET_HAS_setcond2 0
74
#define TCG_TARGET_HAS_qemu_st8_i32 0
75
diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h
76
index XXXXXXX..XXXXXXX 100644
77
--- a/tcg/mips/tcg-target.h
78
+++ b/tcg/mips/tcg-target.h
79
@@ -XXX,XX +XXX,XX @@ extern bool use_mips32r2_instructions;
80
#define TCG_TARGET_HAS_muluh_i32 1
81
#define TCG_TARGET_HAS_mulsh_i32 1
82
#define TCG_TARGET_HAS_bswap32_i32 1
83
-#define TCG_TARGET_HAS_direct_jump 0
84
85
#if TCG_TARGET_REG_BITS == 64
86
#define TCG_TARGET_HAS_add2_i32 0
87
diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
88
index XXXXXXX..XXXXXXX 100644
89
--- a/tcg/ppc/tcg-target.h
90
+++ b/tcg/ppc/tcg-target.h
91
@@ -XXX,XX +XXX,XX @@ extern bool have_vsx;
92
#define TCG_TARGET_HAS_muls2_i32 0
93
#define TCG_TARGET_HAS_muluh_i32 1
94
#define TCG_TARGET_HAS_mulsh_i32 1
95
-#define TCG_TARGET_HAS_direct_jump 1
96
#define TCG_TARGET_HAS_qemu_st8_i32 0
97
98
#if TCG_TARGET_REG_BITS == 64
99
diff --git a/tcg/riscv/tcg-target.h b/tcg/riscv/tcg-target.h
100
index XXXXXXX..XXXXXXX 100644
101
--- a/tcg/riscv/tcg-target.h
102
+++ b/tcg/riscv/tcg-target.h
103
@@ -XXX,XX +XXX,XX @@ typedef enum {
104
#define TCG_TARGET_HAS_clz_i32 0
105
#define TCG_TARGET_HAS_ctz_i32 0
106
#define TCG_TARGET_HAS_ctpop_i32 0
107
-#define TCG_TARGET_HAS_direct_jump 0
108
#define TCG_TARGET_HAS_brcond2 1
109
#define TCG_TARGET_HAS_setcond2 1
110
#define TCG_TARGET_HAS_qemu_st8_i32 0
111
diff --git a/tcg/s390x/tcg-target.h b/tcg/s390x/tcg-target.h
112
index XXXXXXX..XXXXXXX 100644
113
--- a/tcg/s390x/tcg-target.h
114
+++ b/tcg/s390x/tcg-target.h
115
@@ -XXX,XX +XXX,XX @@ extern uint64_t s390_facilities[3];
116
#define TCG_TARGET_HAS_mulsh_i32 0
117
#define TCG_TARGET_HAS_extrl_i64_i32 0
118
#define TCG_TARGET_HAS_extrh_i64_i32 0
119
-#define TCG_TARGET_HAS_direct_jump 1
120
#define TCG_TARGET_HAS_qemu_st8_i32 0
121
122
#define TCG_TARGET_HAS_div2_i64 1
123
diff --git a/tcg/sparc64/tcg-target.h b/tcg/sparc64/tcg-target.h
124
index XXXXXXX..XXXXXXX 100644
125
--- a/tcg/sparc64/tcg-target.h
126
+++ b/tcg/sparc64/tcg-target.h
127
@@ -XXX,XX +XXX,XX @@ extern bool use_vis3_instructions;
128
#define TCG_TARGET_HAS_muls2_i32 1
129
#define TCG_TARGET_HAS_muluh_i32 0
130
#define TCG_TARGET_HAS_mulsh_i32 0
131
-#define TCG_TARGET_HAS_direct_jump 1
132
#define TCG_TARGET_HAS_qemu_st8_i32 0
133
134
#define TCG_TARGET_HAS_extrl_i64_i32 1
135
diff --git a/tcg/tci/tcg-target.h b/tcg/tci/tcg-target.h
136
index XXXXXXX..XXXXXXX 100644
137
--- a/tcg/tci/tcg-target.h
138
+++ b/tcg/tci/tcg-target.h
139
@@ -XXX,XX +XXX,XX @@
140
#define TCG_TARGET_HAS_muls2_i32 1
141
#define TCG_TARGET_HAS_muluh_i32 0
142
#define TCG_TARGET_HAS_mulsh_i32 0
143
-#define TCG_TARGET_HAS_direct_jump 0
144
#define TCG_TARGET_HAS_qemu_st8_i32 0
145
146
#if TCG_TARGET_REG_BITS == 64
147
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
148
index XXXXXXX..XXXXXXX 100644
149
--- a/accel/tcg/cpu-exec.c
150
+++ b/accel/tcg/cpu-exec.c
151
@@ -XXX,XX +XXX,XX @@ void cpu_exec_step_atomic(CPUState *cpu)
152
153
void tb_set_jmp_target(TranslationBlock *tb, int n, uintptr_t addr)
154
{
155
+ /*
156
+ * Get the rx view of the structure, from which we find the
157
+ * executable code address, and tb_target_set_jmp_target can
158
+ * produce a pc-relative displacement to jmp_target_addr[n].
159
+ */
160
+ const TranslationBlock *c_tb = tcg_splitwx_to_rx(tb);
161
+ uintptr_t offset = tb->jmp_insn_offset[n];
162
+ uintptr_t jmp_rx = (uintptr_t)tb->tc.ptr + offset;
163
+ uintptr_t jmp_rw = jmp_rx - tcg_splitwx_diff;
164
+
165
tb->jmp_target_addr[n] = addr;
166
- if (TCG_TARGET_HAS_direct_jump) {
167
- /*
168
- * Get the rx view of the structure, from which we find the
169
- * executable code address, and tb_target_set_jmp_target can
170
- * produce a pc-relative displacement to jmp_target_addr[n].
171
- */
172
- const TranslationBlock *c_tb = tcg_splitwx_to_rx(tb);
173
- uintptr_t offset = tb->jmp_insn_offset[n];
174
- uintptr_t jmp_rx = (uintptr_t)tb->tc.ptr + offset;
175
- uintptr_t jmp_rw = jmp_rx - tcg_splitwx_diff;
176
- tb_target_set_jmp_target(c_tb, n, jmp_rx, jmp_rw);
177
- }
178
+ tb_target_set_jmp_target(c_tb, n, jmp_rx, jmp_rw);
179
}
180
181
static inline void tb_add_jump(TranslationBlock *tb, int n,
182
diff --git a/tcg/tcg.c b/tcg/tcg.c
183
index XXXXXXX..XXXXXXX 100644
184
--- a/tcg/tcg.c
185
+++ b/tcg/tcg.c
186
@@ -XXX,XX +XXX,XX @@ static void G_GNUC_UNUSED set_jmp_insn_offset(TCGContext *s, int which)
187
* We will check for overflow at the end of the opcode loop in
188
* tcg_gen_code, where we bound tcg_current_code_size to UINT16_MAX.
189
*/
190
- tcg_debug_assert(TCG_TARGET_HAS_direct_jump);
191
s->gen_tb->jmp_insn_offset[which] = tcg_current_code_size(s);
192
}
193
194
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
195
index XXXXXXX..XXXXXXX 100644
196
--- a/tcg/arm/tcg-target.c.inc
197
+++ b/tcg/arm/tcg-target.c.inc
198
@@ -XXX,XX +XXX,XX @@ static void tcg_out_goto_tb(TCGContext *s, int which)
199
intptr_t ptr, dif, dil;
200
TCGReg base = TCG_REG_PC;
201
202
- qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
203
ptr = get_jmp_target_addr(s, which);
204
dif = tcg_pcrel_diff(s, (void *)ptr) - 8;
205
dil = sextract32(dif, 0, 12);
206
diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
207
index XXXXXXX..XXXXXXX 100644
208
--- a/tcg/mips/tcg-target.c.inc
209
+++ b/tcg/mips/tcg-target.c.inc
210
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
211
static void tcg_out_goto_tb(TCGContext *s, int which)
212
{
213
/* indirect jump method */
214
- qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
215
tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP0, TCG_REG_ZERO,
216
get_jmp_target_addr(s, which));
217
tcg_out_opc_reg(s, OPC_JR, 0, TCG_TMP0, 0);
218
diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
219
index XXXXXXX..XXXXXXX 100644
220
--- a/tcg/riscv/tcg-target.c.inc
221
+++ b/tcg/riscv/tcg-target.c.inc
222
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
223
224
static void tcg_out_goto_tb(TCGContext *s, int which)
225
{
226
- qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
227
/* indirect jump method */
228
tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, TCG_REG_ZERO,
229
get_jmp_target_addr(s, which));
230
diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
231
index XXXXXXX..XXXXXXX 100644
232
--- a/tcg/s390x/tcg-target.c.inc
233
+++ b/tcg/s390x/tcg-target.c.inc
234
@@ -XXX,XX +XXX,XX @@ static void tcg_out_goto_tb(TCGContext *s, int which)
235
void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
236
uintptr_t jmp_rx, uintptr_t jmp_rw)
237
{
238
+ if (!HAVE_FACILITY(GEN_INST_EXT)) {
239
+ return;
240
+ }
241
/* patch the branch destination */
242
uintptr_t addr = tb->jmp_target_addr[n];
243
intptr_t disp = addr - (jmp_rx - 2);
244
diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
245
index XXXXXXX..XXXXXXX 100644
246
--- a/tcg/tci/tcg-target.c.inc
247
+++ b/tcg/tci/tcg-target.c.inc
248
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg)
249
250
static void tcg_out_goto_tb(TCGContext *s, int which)
251
{
252
- qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
253
/* indirect jump method. */
254
tcg_out_op_p(s, INDEX_op_goto_tb, (void *)get_jmp_target_addr(s, which));
255
set_jmp_reset_offset(s, which);
256
--
257
2.34.1
258
259
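In the cpu-exec.c hunk above, the patch site is described by two addresses
because the code buffer may be mapped twice under split read-execute/read-write
(split-wx): the executable view is used for pc-relative displacements, while
the store lands in the writable alias. A schematic sketch of that arithmetic,
with simplified types rather than the real TranslationBlock:

    #include <stdint.h>

    /* Byte distance from the executable (rx) mapping of the code buffer
     * to its writable (rw) alias; 0 when the two views coincide. */
    static intptr_t splitwx_diff;

    struct tb_sketch {
        void *tc_ptr;                 /* start of generated code, rx view */
        uint16_t jmp_insn_offset[2];  /* offsets of the patchable branches */
    };

    static void jmp_patch_addrs(const struct tb_sketch *tb, int n,
                                uintptr_t *jmp_rx, uintptr_t *jmp_rw)
    {
        *jmp_rx = (uintptr_t)tb->tc_ptr + tb->jmp_insn_offset[n];
        *jmp_rw = *jmp_rx - splitwx_diff;   /* same insn, writable mapping */
    }
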
1
Merge into the only caller, but at the same time split
1
The old implementation replaces two insns, swapping between
2
out tlb_mmu_init to initialize a single tlb entry.
3
2
3
    b    <dest>
4
    nop
5
    br    x30
6
and
7
    adrp    x30, <dest>
8
    addi    x30, x30, lo12:<dest>
9
    br    x30
10
11
There is a race condition in which a thread could be stopped at
12
the PC of the second insn, and when restarted does not see the
13
complete address computation and branches to nowhere.
14
15
The new implementation replaces only one insn, swapping between
16
17
    b    <dest>
18
    br    tmp
19
and
20
    ldr    tmp, <jmp_addr>
21
    br    tmp
22
23
Reported-by: hev <r@hev.cc>
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
24
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Reviewed-by: Alistair Francis <alistair.francis@wdc.com>
6
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
7
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
25
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
8
---
26
---
9
accel/tcg/cputlb.c | 33 ++++++++++++++++-----------------
27
tcg/aarch64/tcg-target.h | 2 +-
10
1 file changed, 16 insertions(+), 17 deletions(-)
28
tcg/aarch64/tcg-target.c.inc | 66 +++++++++++++++---------------------
29
2 files changed, 29 insertions(+), 39 deletions(-)
11
30
12
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
31
diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
13
index XXXXXXX..XXXXXXX 100644
32
index XXXXXXX..XXXXXXX 100644
14
--- a/accel/tcg/cputlb.c
33
--- a/tcg/aarch64/tcg-target.h
15
+++ b/accel/tcg/cputlb.c
34
+++ b/tcg/aarch64/tcg-target.h
16
@@ -XXX,XX +XXX,XX @@ static void tlb_window_reset(CPUTLBDesc *desc, int64_t ns,
35
@@ -XXX,XX +XXX,XX @@
17
desc->window_max_entries = max_entries;
36
37
#define TCG_TARGET_INSN_UNIT_SIZE 4
38
#define TCG_TARGET_TLB_DISPLACEMENT_BITS 24
39
-#define MAX_CODE_GEN_BUFFER_SIZE (2 * GiB)
40
+#define MAX_CODE_GEN_BUFFER_SIZE ((size_t)-1)
41
42
typedef enum {
43
TCG_REG_X0, TCG_REG_X1, TCG_REG_X2, TCG_REG_X3,
44
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
45
index XXXXXXX..XXXXXXX 100644
46
--- a/tcg/aarch64/tcg-target.c.inc
47
+++ b/tcg/aarch64/tcg-target.c.inc
48
@@ -XXX,XX +XXX,XX @@ static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target,
49
tcg_out_call_int(s, target);
18
}
50
}
19
51
20
-static void tlb_dyn_init(CPUArchState *env)
52
-void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
53
- uintptr_t jmp_rx, uintptr_t jmp_rw)
21
-{
54
-{
22
- int i;
55
- uintptr_t addr = tb->jmp_target_addr[n];
56
- tcg_insn_unit i1, i2;
57
- TCGType rt = TCG_TYPE_I64;
58
- TCGReg rd = TCG_REG_TMP;
59
- uint64_t pair;
23
-
60
-
24
- for (i = 0; i < NB_MMU_MODES; i++) {
61
- ptrdiff_t offset = addr - jmp_rx;
25
- CPUTLBDesc *desc = &env_tlb(env)->d[i];
26
- size_t n_entries = 1 << CPU_TLB_DYN_DEFAULT_BITS;
27
-
62
-
28
- tlb_window_reset(desc, get_clock_realtime(), 0);
63
- if (offset == sextract64(offset, 0, 26)) {
29
- desc->n_used_entries = 0;
64
- i1 = I3206_B | ((offset >> 2) & 0x3ffffff);
30
- env_tlb(env)->f[i].mask = (n_entries - 1) << CPU_TLB_ENTRY_BITS;
65
- i2 = NOP;
31
- env_tlb(env)->f[i].table = g_new(CPUTLBEntry, n_entries);
66
- } else {
32
- env_tlb(env)->d[i].iotlb = g_new(CPUIOTLBEntry, n_entries);
67
- offset = (addr >> 12) - (jmp_rx >> 12);
68
-
69
- /* patch ADRP */
70
- i1 = I3406_ADRP | (offset & 3) << 29 | (offset & 0x1ffffc) << (5 - 2) | rd;
71
- /* patch ADDI */
72
- i2 = I3401_ADDI | rt << 31 | (addr & 0xfff) << 10 | rd << 5 | rd;
33
- }
73
- }
74
- pair = (uint64_t)i2 << 32 | i1;
75
- qatomic_set((uint64_t *)jmp_rw, pair);
76
- flush_idcache_range(jmp_rx, jmp_rw, 8);
34
-}
77
-}
35
-
78
-
36
/**
79
static inline void tcg_out_goto_label(TCGContext *s, TCGLabel *l)
37
* tlb_mmu_resize_locked() - perform TLB resize bookkeeping; resize if necessary
80
{
38
* @desc: The CPUTLBDesc portion of the TLB
81
if (!l->has_value) {
39
@@ -XXX,XX +XXX,XX @@ static void tlb_flush_one_mmuidx_locked(CPUArchState *env, int mmu_idx)
82
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
40
tlb_mmu_flush_locked(desc, fast);
83
static void tcg_out_goto_tb(TCGContext *s, int which)
84
{
85
/*
86
- * Ensure that ADRP+ADD are 8-byte aligned so that an atomic
87
- * write can be used to patch the target address.
88
+ * Direct branch, or indirect address load, will be patched
89
+ * by tb_target_set_jmp_target. Assert indirect load offset
90
+ * in range early, regardless of direct branch distance.
91
*/
92
- if ((uintptr_t)s->code_ptr & 7) {
93
- tcg_out32(s, NOP);
94
- }
95
+ intptr_t i_off = tcg_pcrel_diff(s, (void *)get_jmp_target_addr(s, which));
96
+ tcg_debug_assert(i_off == sextract64(i_off, 0, 21));
97
+
98
set_jmp_insn_offset(s, which);
99
- /*
100
- * actual branch destination will be patched by
101
- * tb_target_set_jmp_target later
102
- */
103
- tcg_out_insn(s, 3406, ADRP, TCG_REG_TMP, 0);
104
- tcg_out_insn(s, 3401, ADDI, TCG_TYPE_I64, TCG_REG_TMP, TCG_REG_TMP, 0);
105
+ tcg_out32(s, I3206_B);
106
tcg_out_insn(s, 3207, BR, TCG_REG_TMP);
107
set_jmp_reset_offset(s, which);
41
}
108
}
42
109
43
+static void tlb_mmu_init(CPUTLBDesc *desc, CPUTLBDescFast *fast, int64_t now)
110
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
111
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
44
+{
112
+{
45
+ size_t n_entries = 1 << CPU_TLB_DYN_DEFAULT_BITS;
113
+ uintptr_t d_addr = tb->jmp_target_addr[n];
114
+ ptrdiff_t d_offset = d_addr - jmp_rx;
115
+ tcg_insn_unit insn;
46
+
116
+
47
+ tlb_window_reset(desc, now, 0);
117
+ /* Either directly branch, or indirect branch load. */
48
+ desc->n_used_entries = 0;
118
+ if (d_offset == sextract64(d_offset, 0, 28)) {
49
+ fast->mask = (n_entries - 1) << CPU_TLB_ENTRY_BITS;
119
+ insn = deposit32(I3206_B, 0, 26, d_offset >> 2);
50
+ fast->table = g_new(CPUTLBEntry, n_entries);
120
+ } else {
51
+ desc->iotlb = g_new(CPUIOTLBEntry, n_entries);
121
+ uintptr_t i_addr = (uintptr_t)&tb->jmp_target_addr[n];
122
+ ptrdiff_t i_offset = i_addr - jmp_rx;
123
+
124
+ /* Note that we asserted this in range in tcg_out_goto_tb. */
125
+ insn = deposit32(I3305_LDR | TCG_REG_TMP, 0, 5, i_offset >> 2);
126
+ }
127
+ qatomic_set((uint32_t *)jmp_rw, insn);
128
+ flush_idcache_range(jmp_rx, jmp_rw, 4);
52
+}
129
+}
53
+
130
+
54
static inline void tlb_n_used_entries_inc(CPUArchState *env, uintptr_t mmu_idx)
131
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
55
{
132
const TCGArg args[TCG_MAX_OP_ARGS],
56
env_tlb(env)->d[mmu_idx].n_used_entries++;
133
const int const_args[TCG_MAX_OP_ARGS])
57
@@ -XXX,XX +XXX,XX @@ static inline void tlb_n_used_entries_dec(CPUArchState *env, uintptr_t mmu_idx)
58
void tlb_init(CPUState *cpu)
59
{
60
CPUArchState *env = cpu->env_ptr;
61
+ int64_t now = get_clock_realtime();
62
+ int i;
63
64
qemu_spin_init(&env_tlb(env)->c.lock);
65
66
/* Ensure that cpu_reset performs a full flush. */
67
env_tlb(env)->c.dirty = ALL_MMUIDX_BITS;
68
69
- tlb_dyn_init(env);
70
+ for (i = 0; i < NB_MMU_MODES; i++) {
71
+ tlb_mmu_init(&env_tlb(env)->d[i], &env_tlb(env)->f[i], now);
72
+ }
73
}
74
75
/* flush_all_helper: run fn across all cpus
76
--
134
--
77
2.20.1
135
2.34.1
78
136
79
137
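The reason the aarch64 patch above insists on rewriting a single instruction:
a naturally aligned 32-bit store is atomic with respect to concurrent
instruction fetch, so another thread sees either the old or the new
instruction, never half of a two-instruction sequence. A hedged sketch of the
patch step using generic compiler builtins (the real code uses qatomic_set and
flush_idcache_range):

    #include <stdint.h>

    /* Replace one 32-bit instruction word and flush the icache for it.
     * jmp_rw is the writable alias of the executable address jmp_rx. */
    static void patch_one_insn(uintptr_t jmp_rx, uintptr_t jmp_rw,
                               uint32_t insn)
    {
        __atomic_store_n((uint32_t *)jmp_rw, insn, __ATOMIC_RELEASE);
        __builtin___clear_cache((char *)jmp_rx, (char *)(jmp_rx + 4));
    }
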
New patch
1
1
The old ppc64 implementation replaces 2 or 4 insns, which leaves a race
2
condition in which a thread could be stopped at a PC in the middle of
3
the sequence, and when restarted does not see the complete address
4
computation and branches to nowhere.
5
6
The new implementation replaces only one insn, swapping between
7
8
    b <dest>
9
and
10
    mtctr    r31
11
12
falling through to a general-case indirect branch.
13
14
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
15
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
16
---
17
tcg/ppc/tcg-target.h | 3 +-
18
tcg/ppc/tcg-target.c.inc | 158 +++++++++++----------------------------
19
2 files changed, 44 insertions(+), 117 deletions(-)
20
21
diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
22
index XXXXXXX..XXXXXXX 100644
23
--- a/tcg/ppc/tcg-target.h
24
+++ b/tcg/ppc/tcg-target.h
25
@@ -XXX,XX +XXX,XX @@
26
27
#ifdef _ARCH_PPC64
28
# define TCG_TARGET_REG_BITS 64
29
-# define MAX_CODE_GEN_BUFFER_SIZE (2 * GiB)
30
#else
31
# define TCG_TARGET_REG_BITS 32
32
-# define MAX_CODE_GEN_BUFFER_SIZE (32 * MiB)
33
#endif
34
+#define MAX_CODE_GEN_BUFFER_SIZE ((size_t)-1)
35
36
#define TCG_TARGET_NB_REGS 64
37
#define TCG_TARGET_INSN_UNIT_SIZE 4
38
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
39
index XXXXXXX..XXXXXXX 100644
40
--- a/tcg/ppc/tcg-target.c.inc
41
+++ b/tcg/ppc/tcg-target.c.inc
42
@@ -XXX,XX +XXX,XX @@ static void tcg_out_mb(TCGContext *s, TCGArg a0)
43
tcg_out32(s, insn);
44
}
45
46
-static inline uint64_t make_pair(tcg_insn_unit i1, tcg_insn_unit i2)
47
-{
48
- if (HOST_BIG_ENDIAN) {
49
- return (uint64_t)i1 << 32 | i2;
50
- }
51
- return (uint64_t)i2 << 32 | i1;
52
-}
53
-
54
-static inline void ppc64_replace2(uintptr_t rx, uintptr_t rw,
55
- tcg_insn_unit i0, tcg_insn_unit i1)
56
-{
57
-#if TCG_TARGET_REG_BITS == 64
58
- qatomic_set((uint64_t *)rw, make_pair(i0, i1));
59
- flush_idcache_range(rx, rw, 8);
60
-#else
61
- qemu_build_not_reached();
62
-#endif
63
-}
64
-
65
-static inline void ppc64_replace4(uintptr_t rx, uintptr_t rw,
66
- tcg_insn_unit i0, tcg_insn_unit i1,
67
- tcg_insn_unit i2, tcg_insn_unit i3)
68
-{
69
- uint64_t p[2];
70
-
71
- p[!HOST_BIG_ENDIAN] = make_pair(i0, i1);
72
- p[HOST_BIG_ENDIAN] = make_pair(i2, i3);
73
-
74
- /*
75
- * There's no convenient way to get the compiler to allocate a pair
76
- * of registers at an even index, so copy into r6/r7 and clobber.
77
- */
78
- asm("mr %%r6, %1\n\t"
79
- "mr %%r7, %2\n\t"
80
- "stq %%r6, %0"
81
- : "=Q"(*(__int128 *)rw) : "r"(p[0]), "r"(p[1]) : "r6", "r7");
82
- flush_idcache_range(rx, rw, 16);
83
-}
84
-
85
-void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
86
- uintptr_t jmp_rx, uintptr_t jmp_rw)
87
-{
88
- tcg_insn_unit i0, i1, i2, i3;
89
- uintptr_t addr = tb->jmp_target_addr[n];
90
- intptr_t tb_diff = addr - (uintptr_t)tb->tc.ptr;
91
- intptr_t br_diff = addr - (jmp_rx + 4);
92
- intptr_t lo, hi;
93
-
94
- if (TCG_TARGET_REG_BITS == 32) {
95
- intptr_t diff = addr - jmp_rx;
96
- tcg_debug_assert(in_range_b(diff));
97
- qatomic_set((uint32_t *)jmp_rw, B | (diff & 0x3fffffc));
98
- flush_idcache_range(jmp_rx, jmp_rw, 4);
99
- return;
100
- }
101
-
102
- /*
103
- * For 16-bit displacements, we can use a single add + branch.
104
- * This happens quite often.
105
- */
106
- if (tb_diff == (int16_t)tb_diff) {
107
- i0 = ADDI | TAI(TCG_REG_TB, TCG_REG_TB, tb_diff);
108
- i1 = B | (br_diff & 0x3fffffc);
109
- ppc64_replace2(jmp_rx, jmp_rw, i0, i1);
110
- return;
111
- }
112
-
113
- lo = (int16_t)tb_diff;
114
- hi = (int32_t)(tb_diff - lo);
115
- assert(tb_diff == hi + lo);
116
- i0 = ADDIS | TAI(TCG_REG_TB, TCG_REG_TB, hi >> 16);
117
- i1 = ADDI | TAI(TCG_REG_TB, TCG_REG_TB, lo);
118
-
119
- /*
120
- * Without stq from 2.07, we can only update two insns,
121
- * and those must be the ones that load the target address.
122
- */
123
- if (!have_isa_2_07) {
124
- ppc64_replace2(jmp_rx, jmp_rw, i0, i1);
125
- return;
126
- }
127
-
128
- /*
129
- * For 26-bit displacements, we can use a direct branch.
130
- * Otherwise we still need the indirect branch, which we
131
- * must restore after a potential direct branch write.
132
- */
133
- br_diff -= 4;
134
- if (in_range_b(br_diff)) {
135
- i2 = B | (br_diff & 0x3fffffc);
136
- i3 = NOP;
137
- } else {
138
- i2 = MTSPR | RS(TCG_REG_TB) | CTR;
139
- i3 = BCCTR | BO_ALWAYS;
140
- }
141
- ppc64_replace4(jmp_rx, jmp_rw, i0, i1, i2, i3);
142
-}
143
-
144
static void tcg_out_call_int(TCGContext *s, int lk,
145
const tcg_insn_unit *target)
146
{
147
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg)
148
149
static void tcg_out_goto_tb(TCGContext *s, int which)
150
{
151
- /* Direct jump. */
152
- if (TCG_TARGET_REG_BITS == 64) {
153
- /* Ensure the next insns are 8 or 16-byte aligned. */
154
- while ((uintptr_t)s->code_ptr & (have_isa_2_07 ? 15 : 7)) {
155
- tcg_out32(s, NOP);
156
- }
157
+ uintptr_t ptr = get_jmp_target_addr(s, which);
158
+
159
+ if (USE_REG_TB) {
160
+ ptrdiff_t offset = tcg_tbrel_diff(s, (void *)ptr);
161
+ tcg_out_mem_long(s, LD, LDX, TCG_REG_TB, TCG_REG_TB, offset);
162
+
163
+ /* Direct branch will be patched by tb_target_set_jmp_target. */
164
set_jmp_insn_offset(s, which);
165
- tcg_out32(s, ADDIS | TAI(TCG_REG_TB, TCG_REG_TB, 0));
166
- tcg_out32(s, ADDI | TAI(TCG_REG_TB, TCG_REG_TB, 0));
167
tcg_out32(s, MTSPR | RS(TCG_REG_TB) | CTR);
168
+
169
+ /* When branch is out of range, fall through to indirect. */
170
+ tcg_out32(s, BCCTR | BO_ALWAYS);
171
+
172
+ /* For the unlinked case, need to reset TCG_REG_TB. */
173
+ set_jmp_reset_offset(s, which);
174
+ tcg_out_mem_long(s, ADDI, ADD, TCG_REG_TB, TCG_REG_TB,
175
+ -tcg_current_code_size(s));
176
+ } else {
177
+ /* Direct branch will be patched by tb_target_set_jmp_target. */
178
+ set_jmp_insn_offset(s, which);
179
+ tcg_out32(s, NOP);
180
+
181
+ /* When branch is out of range, fall through to indirect. */
182
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP1, ptr - (int16_t)ptr);
183
+ tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1, (int16_t)ptr);
184
+ tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
185
tcg_out32(s, BCCTR | BO_ALWAYS);
186
set_jmp_reset_offset(s, which);
187
- if (USE_REG_TB) {
188
- /* For the unlinked case, need to reset TCG_REG_TB. */
189
- tcg_out_mem_long(s, ADDI, ADD, TCG_REG_TB, TCG_REG_TB,
190
- -tcg_current_code_size(s));
191
- }
192
- } else {
193
- set_jmp_insn_offset(s, which);
194
- tcg_out32(s, B);
195
- set_jmp_reset_offset(s, which);
196
}
197
}
198
199
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
200
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
201
+{
202
+ uintptr_t addr = tb->jmp_target_addr[n];
203
+ intptr_t diff = addr - jmp_rx;
204
+ tcg_insn_unit insn;
205
+
206
+ if (in_range_b(diff)) {
207
+ insn = B | (diff & 0x3fffffc);
208
+ } else if (USE_REG_TB) {
209
+ insn = MTSPR | RS(TCG_REG_TB) | CTR;
210
+ } else {
211
+ insn = NOP;
212
+ }
213
+
214
+ qatomic_set((uint32_t *)jmp_rw, insn);
215
+ flush_idcache_range(jmp_rx, jmp_rw, 4);
216
+}
217
+
218
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
219
const TCGArg args[TCG_MAX_OP_ARGS],
220
const int const_args[TCG_MAX_OP_ARGS])
221
--
222
2.34.1
223
224
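Which instruction tb_target_set_jmp_target writes in the ppc patch above
depends only on whether the branch displacement fits the direct-branch
encoding. A generic sketch of that range test; the 26-bit width mirrors the
ppc B field, and the helper names are illustrative:

    #include <stdbool.h>
    #include <stdint.h>

    /* True if v fits in a signed immediate field of 'bits' bits. */
    static bool fits_signed(int64_t v, unsigned bits)
    {
        int64_t lo = -((int64_t)1 << (bits - 1));
        int64_t hi = ((int64_t)1 << (bits - 1)) - 1;
        return v >= lo && v <= hi;
    }

    /* Patch to a direct branch when the displacement is encodable,
     * otherwise leave the indirect fallback path in place. */
    static bool use_direct_branch(uintptr_t target, uintptr_t patch_site)
    {
        intptr_t disp = (intptr_t)(target - patch_site);
        return (disp & 3) == 0 && fits_signed(disp, 26);
    }
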
1
No functional change, but the smaller expressions make
1
This is always true for sparc64, so this is dead since 3a5f6805c7ca.
2
the code easier to read.
3
2
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
3
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Reviewed-by: Alistair Francis <alistair.francis@wdc.com>
4
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
6
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
7
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
8
---
6
---
9
accel/tcg/cputlb.c | 35 +++++++++++++++++------------------
7
tcg/sparc64/tcg-target.c.inc | 62 ++++++++++++------------------------
10
1 file changed, 17 insertions(+), 18 deletions(-)
8
1 file changed, 21 insertions(+), 41 deletions(-)
11
9
12
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
10
diff --git a/tcg/sparc64/tcg-target.c.inc b/tcg/sparc64/tcg-target.c.inc
13
index XXXXXXX..XXXXXXX 100644
11
index XXXXXXX..XXXXXXX 100644
14
--- a/accel/tcg/cputlb.c
12
--- a/tcg/sparc64/tcg-target.c.inc
15
+++ b/accel/tcg/cputlb.c
13
+++ b/tcg/sparc64/tcg-target.c.inc
16
@@ -XXX,XX +XXX,XX @@ static void tlb_dyn_init(CPUArchState *env)
14
@@ -XXX,XX +XXX,XX @@ static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
17
15
#endif
18
/**
16
19
* tlb_mmu_resize_locked() - perform TLB resize bookkeeping; resize if necessary
17
#define TCG_REG_TB TCG_REG_I1
20
- * @env: CPU that owns the TLB
18
-#define USE_REG_TB (sizeof(void *) > 4)
21
- * @mmu_idx: MMU index of the TLB
19
22
+ * @desc: The CPUTLBDesc portion of the TLB
20
static const int tcg_target_reg_alloc_order[] = {
23
+ * @fast: The CPUTLBDescFast portion of the same TLB
21
TCG_REG_L0,
24
*
22
@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
25
* Called with tlb_lock_held.
23
}
26
*
24
27
@@ -XXX,XX +XXX,XX @@ static void tlb_dyn_init(CPUArchState *env)
25
/* A 13-bit constant relative to the TB. */
28
* high), since otherwise we are likely to have a significant amount of
26
- if (!in_prologue && USE_REG_TB) {
29
* conflict misses.
27
+ if (!in_prologue) {
30
*/
28
test = tcg_tbrel_diff(s, (void *)arg);
31
-static void tlb_mmu_resize_locked(CPUArchState *env, int mmu_idx)
29
if (check_fit_ptr(test, 13)) {
32
+static void tlb_mmu_resize_locked(CPUTLBDesc *desc, CPUTLBDescFast *fast)
30
tcg_out_arithi(s, ret, TCG_REG_TB, test, ARITH_ADD);
31
@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
32
}
33
34
/* Use the constant pool, if possible. */
35
- if (!in_prologue && USE_REG_TB) {
36
+ if (!in_prologue) {
37
new_pool_label(s, arg, R_SPARC_13, s->code_ptr,
38
tcg_tbrel_diff(s, NULL));
39
tcg_out32(s, LDX | INSN_RD(ret) | INSN_RS1(TCG_REG_TB));
40
@@ -XXX,XX +XXX,XX @@ static void tcg_target_qemu_prologue(TCGContext *s)
41
#endif
42
43
/* We choose TCG_REG_TB such that no move is required. */
44
- if (USE_REG_TB) {
45
- QEMU_BUILD_BUG_ON(TCG_REG_TB != TCG_REG_I1);
46
- tcg_regset_set_reg(s->reserved_regs, TCG_REG_TB);
47
- }
48
+ QEMU_BUILD_BUG_ON(TCG_REG_TB != TCG_REG_I1);
49
+ tcg_regset_set_reg(s->reserved_regs, TCG_REG_TB);
50
51
tcg_out_arithi(s, TCG_REG_G0, TCG_REG_I1, 0, JMPL);
52
/* delay slot */
53
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
54
tcg_out_arithi(s, TCG_REG_G0, TCG_REG_I7, 8, RETURN);
55
tcg_out_movi_imm13(s, TCG_REG_O0, a0);
56
return;
57
- } else if (USE_REG_TB) {
58
+ } else {
59
intptr_t tb_diff = tcg_tbrel_diff(s, (void *)a0);
60
if (check_fit_ptr(tb_diff, 13)) {
61
tcg_out_arithi(s, TCG_REG_G0, TCG_REG_I7, 8, RETURN);
62
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
63
64
static void tcg_out_goto_tb(TCGContext *s, int which)
33
{
65
{
34
- CPUTLBDesc *desc = &env_tlb(env)->d[mmu_idx];
66
+ int c;
35
- size_t old_size = tlb_n_entries(&env_tlb(env)->f[mmu_idx]);
67
+
36
+ size_t old_size = tlb_n_entries(fast);
68
/* Direct jump. */
37
size_t rate;
69
- if (USE_REG_TB) {
38
size_t new_size = old_size;
70
- /* make sure the patch is 8-byte aligned. */
39
int64_t now = get_clock_realtime();
71
- if ((intptr_t)s->code_ptr & 4) {
40
@@ -XXX,XX +XXX,XX @@ static void tlb_mmu_resize_locked(CPUArchState *env, int mmu_idx)
72
- tcg_out_nop(s);
41
return;
73
- }
74
- set_jmp_insn_offset(s, which);
75
- tcg_out_sethi(s, TCG_REG_T1, 0);
76
- tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, 0, ARITH_OR);
77
- tcg_out_arith(s, TCG_REG_G0, TCG_REG_TB, TCG_REG_T1, JMPL);
78
- tcg_out_arith(s, TCG_REG_TB, TCG_REG_TB, TCG_REG_T1, ARITH_ADD);
79
- } else {
80
- set_jmp_insn_offset(s, which);
81
- tcg_out32(s, CALL);
82
+ /* make sure the patch is 8-byte aligned. */
83
+ if ((intptr_t)s->code_ptr & 4) {
84
tcg_out_nop(s);
42
}
85
}
43
86
+ set_jmp_insn_offset(s, which);
44
- g_free(env_tlb(env)->f[mmu_idx].table);
87
+ tcg_out_sethi(s, TCG_REG_T1, 0);
45
- g_free(env_tlb(env)->d[mmu_idx].iotlb);
88
+ tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, 0, ARITH_OR);
46
+ g_free(fast->table);
89
+ tcg_out_arith(s, TCG_REG_G0, TCG_REG_TB, TCG_REG_T1, JMPL);
47
+ g_free(desc->iotlb);
90
+ tcg_out_arith(s, TCG_REG_TB, TCG_REG_TB, TCG_REG_T1, ARITH_ADD);
48
91
set_jmp_reset_offset(s, which);
49
tlb_window_reset(desc, now, 0);
92
50
/* desc->n_used_entries is cleared by the caller */
51
- env_tlb(env)->f[mmu_idx].mask = (new_size - 1) << CPU_TLB_ENTRY_BITS;
52
- env_tlb(env)->f[mmu_idx].table = g_try_new(CPUTLBEntry, new_size);
53
- env_tlb(env)->d[mmu_idx].iotlb = g_try_new(CPUIOTLBEntry, new_size);
54
+ fast->mask = (new_size - 1) << CPU_TLB_ENTRY_BITS;
55
+ fast->table = g_try_new(CPUTLBEntry, new_size);
56
+ desc->iotlb = g_try_new(CPUIOTLBEntry, new_size);
57
+
58
/*
93
/*
59
* If the allocations fail, try smaller sizes. We just freed some
94
* For the unlinked path of goto_tb, we need to reset TCG_REG_TB
60
* memory, so going back to half of new_size has a good chance of working.
95
* to the beginning of this TB.
61
@@ -XXX,XX +XXX,XX @@ static void tlb_mmu_resize_locked(CPUArchState *env, int mmu_idx)
62
* allocations to fail though, so we progressively reduce the allocation
63
* size, aborting if we cannot even allocate the smallest TLB we support.
64
*/
96
*/
65
- while (env_tlb(env)->f[mmu_idx].table == NULL ||
97
- if (USE_REG_TB) {
66
- env_tlb(env)->d[mmu_idx].iotlb == NULL) {
98
- int c = -tcg_current_code_size(s);
67
+ while (fast->table == NULL || desc->iotlb == NULL) {
99
- if (check_fit_i32(c, 13)) {
68
if (new_size == (1 << CPU_TLB_DYN_MIN_BITS)) {
100
- tcg_out_arithi(s, TCG_REG_TB, TCG_REG_TB, c, ARITH_ADD);
69
error_report("%s: %s", __func__, strerror(errno));
101
- } else {
70
abort();
102
- tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_T1, c);
71
}
103
- tcg_out_arith(s, TCG_REG_TB, TCG_REG_TB, TCG_REG_T1, ARITH_ADD);
72
new_size = MAX(new_size >> 1, 1 << CPU_TLB_DYN_MIN_BITS);
104
- }
73
- env_tlb(env)->f[mmu_idx].mask = (new_size - 1) << CPU_TLB_ENTRY_BITS;
105
+ c = -tcg_current_code_size(s);
74
+ fast->mask = (new_size - 1) << CPU_TLB_ENTRY_BITS;
106
+ if (check_fit_i32(c, 13)) {
75
107
+ tcg_out_arithi(s, TCG_REG_TB, TCG_REG_TB, c, ARITH_ADD);
76
- g_free(env_tlb(env)->f[mmu_idx].table);
108
+ } else {
77
- g_free(env_tlb(env)->d[mmu_idx].iotlb);
109
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_T1, c);
78
- env_tlb(env)->f[mmu_idx].table = g_try_new(CPUTLBEntry, new_size);
110
+ tcg_out_arith(s, TCG_REG_TB, TCG_REG_TB, TCG_REG_T1, ARITH_ADD);
79
- env_tlb(env)->d[mmu_idx].iotlb = g_try_new(CPUIOTLBEntry, new_size);
80
+ g_free(fast->table);
81
+ g_free(desc->iotlb);
82
+ fast->table = g_try_new(CPUTLBEntry, new_size);
83
+ desc->iotlb = g_try_new(CPUIOTLBEntry, new_size);
84
}
111
}
85
}
112
}
86
113
87
static void tlb_flush_one_mmuidx_locked(CPUArchState *env, int mmu_idx)
114
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
88
{
115
switch (opc) {
89
- tlb_mmu_resize_locked(env, mmu_idx);
116
case INDEX_op_goto_ptr:
90
+ tlb_mmu_resize_locked(&env_tlb(env)->d[mmu_idx], &env_tlb(env)->f[mmu_idx]);
117
tcg_out_arithi(s, TCG_REG_G0, a0, 0, JMPL);
91
env_tlb(env)->d[mmu_idx].n_used_entries = 0;
118
- if (USE_REG_TB) {
92
env_tlb(env)->d[mmu_idx].large_page_addr = -1;
119
- tcg_out_mov_delay(s, TCG_REG_TB, a0);
93
env_tlb(env)->d[mmu_idx].large_page_mask = -1;
120
- } else {
121
- tcg_out_nop(s);
122
- }
123
+ tcg_out_mov_delay(s, TCG_REG_TB, a0);
124
break;
125
case INDEX_op_br:
126
tcg_out_bpcc(s, COND_A, BPCC_PT, arg_label(a0));
127
@@ -XXX,XX +XXX,XX @@ void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
128
tcg_debug_assert(tb_disp == (int32_t)tb_disp);
129
tcg_debug_assert(br_disp == (int32_t)br_disp);
130
131
- if (!USE_REG_TB) {
132
- qatomic_set((uint32_t *)jmp_rw,
133
-         deposit32(CALL, 0, 30, br_disp >> 2));
134
- flush_idcache_range(jmp_rx, jmp_rw, 4);
135
- return;
136
- }
137
-
138
/* This does not exercise the range of the branch, but we do
139
still need to be able to load the new value of TCG_REG_TB.
140
But this does still happen quite often. */
94
--
141
--
95
2.20.1
142
2.34.1
96
143
97
144
1
There are no users of this function outside cputlb.c,
1
The old sparc64 implementation may replace two insns, which leaves
2
and its interface will change in the next patch.
2
a race condition in which a thread could be stopped at a PC in the
3
middle of the sequence, and when restarted does not see the complete
4
address computation and branches to nowhere.
5
6
The new implementation replaces only one insn, swapping between a
7
direct branch and a direct call. The TCG_REG_TB register is loaded
8
from tb->jmp_target_addr[] in the delay slot.
3
9
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
10
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Reviewed-by: Alistair Francis <alistair.francis@wdc.com>
6
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
7
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
11
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
8
---
12
---
9
include/exec/cpu_ldst.h | 5 -----
13
tcg/sparc64/tcg-target.c.inc | 87 +++++++++++++++---------------------
10
accel/tcg/cputlb.c | 5 +++++
14
1 file changed, 37 insertions(+), 50 deletions(-)
11
2 files changed, 5 insertions(+), 5 deletions(-)
12
15
13
diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
16
diff --git a/tcg/sparc64/tcg-target.c.inc b/tcg/sparc64/tcg-target.c.inc
14
index XXXXXXX..XXXXXXX 100644
17
index XXXXXXX..XXXXXXX 100644
15
--- a/include/exec/cpu_ldst.h
18
--- a/tcg/sparc64/tcg-target.c.inc
16
+++ b/include/exec/cpu_ldst.h
19
+++ b/tcg/sparc64/tcg-target.c.inc
17
@@ -XXX,XX +XXX,XX @@ static inline uintptr_t tlb_index(CPUArchState *env, uintptr_t mmu_idx,
20
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
18
return (addr >> TARGET_PAGE_BITS) & size_mask;
21
22
static void tcg_out_goto_tb(TCGContext *s, int which)
23
{
24
- int c;
25
+ ptrdiff_t off = tcg_tbrel_diff(s, (void *)get_jmp_target_addr(s, which));
26
27
- /* Direct jump. */
28
- /* make sure the patch is 8-byte aligned. */
29
- if ((intptr_t)s->code_ptr & 4) {
30
- tcg_out_nop(s);
31
- }
32
+ /* Direct branch will be patched by tb_target_set_jmp_target. */
33
set_jmp_insn_offset(s, which);
34
- tcg_out_sethi(s, TCG_REG_T1, 0);
35
- tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, 0, ARITH_OR);
36
- tcg_out_arith(s, TCG_REG_G0, TCG_REG_TB, TCG_REG_T1, JMPL);
37
- tcg_out_arith(s, TCG_REG_TB, TCG_REG_TB, TCG_REG_T1, ARITH_ADD);
38
+ tcg_out32(s, CALL);
39
+ /* delay slot */
40
+ tcg_debug_assert(check_fit_ptr(off, 13));
41
+ tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TB, TCG_REG_TB, off);
42
set_jmp_reset_offset(s, which);
43
44
/*
45
* For the unlinked path of goto_tb, we need to reset TCG_REG_TB
46
* to the beginning of this TB.
47
*/
48
- c = -tcg_current_code_size(s);
49
- if (check_fit_i32(c, 13)) {
50
- tcg_out_arithi(s, TCG_REG_TB, TCG_REG_TB, c, ARITH_ADD);
51
+ off = -tcg_current_code_size(s);
52
+ if (check_fit_i32(off, 13)) {
53
+ tcg_out_arithi(s, TCG_REG_TB, TCG_REG_TB, off, ARITH_ADD);
54
} else {
55
- tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_T1, c);
56
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_T1, off);
57
tcg_out_arith(s, TCG_REG_TB, TCG_REG_TB, TCG_REG_T1, ARITH_ADD);
58
}
19
}
59
}
20
60
21
-static inline size_t tlb_n_entries(CPUArchState *env, uintptr_t mmu_idx)
61
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
22
-{
62
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
23
- return (env_tlb(env)->f[mmu_idx].mask >> CPU_TLB_ENTRY_BITS) + 1;
24
-}
25
-
26
/* Find the TLB entry corresponding to the mmu_idx + address pair. */
27
static inline CPUTLBEntry *tlb_entry(CPUArchState *env, uintptr_t mmu_idx,
28
target_ulong addr)
29
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
30
index XXXXXXX..XXXXXXX 100644
31
--- a/accel/tcg/cputlb.c
32
+++ b/accel/tcg/cputlb.c
33
@@ -XXX,XX +XXX,XX @@ QEMU_BUILD_BUG_ON(sizeof(target_ulong) > sizeof(run_on_cpu_data));
34
QEMU_BUILD_BUG_ON(NB_MMU_MODES > 16);
35
#define ALL_MMUIDX_BITS ((1 << NB_MMU_MODES) - 1)
36
37
+static inline size_t tlb_n_entries(CPUArchState *env, uintptr_t mmu_idx)
38
+{
63
+{
39
+ return (env_tlb(env)->f[mmu_idx].mask >> CPU_TLB_ENTRY_BITS) + 1;
64
+ uintptr_t addr = tb->jmp_target_addr[n];
65
+ intptr_t br_disp = (intptr_t)(addr - jmp_rx) >> 2;
66
+ tcg_insn_unit insn;
67
+
68
+ br_disp >>= 2;
69
+ if (check_fit_ptr(br_disp, 19)) {
70
+ /* ba,pt %icc, addr */
71
+ insn = deposit32(INSN_OP(0) | INSN_OP2(1) | INSN_COND(COND_A)
72
+ | BPCC_ICC | BPCC_PT, 0, 19, br_disp);
73
+ } else if (check_fit_ptr(br_disp, 22)) {
74
+ /* ba addr */
75
+ insn = deposit32(INSN_OP(0) | INSN_OP2(2) | INSN_COND(COND_A),
76
+ 0, 22, br_disp);
77
+ } else {
78
+ /* The code_gen_buffer can't be larger than 2GB. */
79
+ tcg_debug_assert(check_fit_ptr(br_disp, 30));
80
+ /* call addr */
81
+ insn = deposit32(CALL, 0, 30, br_disp);
82
+ }
83
+
84
+ qatomic_set((uint32_t *)jmp_rw, insn);
85
+ flush_idcache_range(jmp_rx, jmp_rw, 4);
40
+}
86
+}
41
+
87
+
42
static inline size_t sizeof_tlb(CPUArchState *env, uintptr_t mmu_idx)
88
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
89
const TCGArg args[TCG_MAX_OP_ARGS],
90
const int const_args[TCG_MAX_OP_ARGS])
91
@@ -XXX,XX +XXX,XX @@ void tcg_register_jit(const void *buf, size_t buf_size)
43
{
92
{
44
return env_tlb(env)->f[mmu_idx].mask + (1 << CPU_TLB_ENTRY_BITS);
93
tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
94
}
95
-
96
-void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
97
- uintptr_t jmp_rx, uintptr_t jmp_rw)
98
-{
99
- uintptr_t addr = tb->jmp_target_addr[n];
100
- intptr_t tb_disp = addr - (uintptr_t)tb->tc.ptr;
101
- intptr_t br_disp = addr - jmp_rx;
102
- tcg_insn_unit i1, i2;
103
-
104
- /* We can reach the entire address space for ILP32.
105
- For LP64, the code_gen_buffer can't be larger than 2GB. */
106
- tcg_debug_assert(tb_disp == (int32_t)tb_disp);
107
- tcg_debug_assert(br_disp == (int32_t)br_disp);
108
-
109
- /* This does not exercise the range of the branch, but we do
110
- still need to be able to load the new value of TCG_REG_TB.
111
- But this does still happen quite often. */
112
- if (check_fit_ptr(tb_disp, 13)) {
113
- /* ba,pt %icc, addr */
114
- i1 = (INSN_OP(0) | INSN_OP2(1) | INSN_COND(COND_A)
115
- | BPCC_ICC | BPCC_PT | INSN_OFF19(br_disp));
116
- i2 = (ARITH_ADD | INSN_RD(TCG_REG_TB) | INSN_RS1(TCG_REG_TB)
117
- | INSN_IMM13(tb_disp));
118
- } else if (tb_disp >= 0) {
119
- i1 = SETHI | INSN_RD(TCG_REG_T1) | ((tb_disp & 0xfffffc00) >> 10);
120
- i2 = (ARITH_OR | INSN_RD(TCG_REG_T1) | INSN_RS1(TCG_REG_T1)
121
- | INSN_IMM13(tb_disp & 0x3ff));
122
- } else {
123
- i1 = SETHI | INSN_RD(TCG_REG_T1) | ((~tb_disp & 0xfffffc00) >> 10);
124
- i2 = (ARITH_XOR | INSN_RD(TCG_REG_T1) | INSN_RS1(TCG_REG_T1)
125
- | INSN_IMM13((tb_disp & 0x3ff) | -0x400));
126
- }
127
-
128
- qatomic_set((uint64_t *)jmp_rw, deposit64(i2, 32, 32, i1));
129
- flush_idcache_range(jmp_rx, jmp_rw, 8);
130
-}
45
--
131
--
46
2.20.1
132
2.34.1
47
133
48
134
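The sparc64 patch above builds each replacement branch by depositing a
displacement into an instruction template. A self-contained sketch of that
bitfield helper, modelled on but not identical to the deposit32() used in the
patch:

    #include <stdint.h>

    /* Insert the low 'len' bits of 'val' into 'insn' starting at bit 'pos'. */
    static uint32_t deposit_bits(uint32_t insn, unsigned pos, unsigned len,
                                 uint32_t val)
    {
        uint32_t mask = (len < 32 ? (1u << len) - 1 : ~0u) << pos;
        return (insn & ~mask) | ((val << pos) & mask);
    }

    /* Example: place a 19-bit word displacement into a branch template,
     * as done for the "ba,pt %icc" form. */
    static uint32_t encode_branch19(uint32_t tmpl, intptr_t byte_disp)
    {
        return deposit_bits(tmpl, 0, 19, (uint32_t)(byte_disp >> 2));
    }
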
1
We do not need the entire CPUArchState to compute these values.
1
Now that tcg can handle direct and indirect goto_tb
2
simultaneously, we can optimistically leave space for
3
a direct branch and fall back to loading the pointer
4
from the TB for an indirect branch.
2
5
3
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
6
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
4
Reviewed-by: Alistair Francis <alistair.francis@wdc.com>
5
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
---
8
---
8
accel/tcg/cputlb.c | 15 ++++++++-------
9
tcg/arm/tcg-target.c.inc | 52 ++++++++++++++++++++++++++++------------
9
1 file changed, 8 insertions(+), 7 deletions(-)
10
1 file changed, 37 insertions(+), 15 deletions(-)
10
11
11
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
12
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
12
index XXXXXXX..XXXXXXX 100644
13
index XXXXXXX..XXXXXXX 100644
13
--- a/accel/tcg/cputlb.c
14
--- a/tcg/arm/tcg-target.c.inc
14
+++ b/accel/tcg/cputlb.c
15
+++ b/tcg/arm/tcg-target.c.inc
15
@@ -XXX,XX +XXX,XX @@ QEMU_BUILD_BUG_ON(sizeof(target_ulong) > sizeof(run_on_cpu_data));
16
@@ -XXX,XX +XXX,XX @@ typedef enum {
16
QEMU_BUILD_BUG_ON(NB_MMU_MODES > 16);
17
ARITH_BIC = 0xe << 21,
17
 #define ALL_MMUIDX_BITS ((1 << NB_MMU_MODES) - 1)
 
-static inline size_t tlb_n_entries(CPUArchState *env, uintptr_t mmu_idx)
+static inline size_t tlb_n_entries(CPUTLBDescFast *fast)
 {
-    return (env_tlb(env)->f[mmu_idx].mask >> CPU_TLB_ENTRY_BITS) + 1;
+    return (fast->mask >> CPU_TLB_ENTRY_BITS) + 1;
 }
 
-static inline size_t sizeof_tlb(CPUArchState *env, uintptr_t mmu_idx)
+static inline size_t sizeof_tlb(CPUTLBDescFast *fast)
 {
-    return env_tlb(env)->f[mmu_idx].mask + (1 << CPU_TLB_ENTRY_BITS);
+    return fast->mask + (1 << CPU_TLB_ENTRY_BITS);
 }
 
 static void tlb_window_reset(CPUTLBDesc *desc, int64_t ns,
@@ -XXX,XX +XXX,XX @@ static void tlb_dyn_init(CPUArchState *env)
 static void tlb_mmu_resize_locked(CPUArchState *env, int mmu_idx)
 {
     CPUTLBDesc *desc = &env_tlb(env)->d[mmu_idx];
-    size_t old_size = tlb_n_entries(env, mmu_idx);
+    size_t old_size = tlb_n_entries(&env_tlb(env)->f[mmu_idx]);
     size_t rate;
     size_t new_size = old_size;
     int64_t now = get_clock_realtime();
@@ -XXX,XX +XXX,XX @@ static void tlb_flush_one_mmuidx_locked(CPUArchState *env, int mmu_idx)
     env_tlb(env)->d[mmu_idx].large_page_addr = -1;
     env_tlb(env)->d[mmu_idx].large_page_mask = -1;
     env_tlb(env)->d[mmu_idx].vindex = 0;
-    memset(env_tlb(env)->f[mmu_idx].table, -1, sizeof_tlb(env, mmu_idx));
+    memset(env_tlb(env)->f[mmu_idx].table, -1,
+           sizeof_tlb(&env_tlb(env)->f[mmu_idx]));
     memset(env_tlb(env)->d[mmu_idx].vtable, -1,
            sizeof(env_tlb(env)->d[0].vtable));
 }
@@ -XXX,XX +XXX,XX @@ void tlb_reset_dirty(CPUState *cpu, ram_addr_t start1, ram_addr_t length)
     qemu_spin_lock(&env_tlb(env)->c.lock);
     for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
         unsigned int i;
-        unsigned int n = tlb_n_entries(env, mmu_idx);
+        unsigned int n = tlb_n_entries(&env_tlb(env)->f[mmu_idx]);
 
         for (i = 0; i < n; i++) {
             tlb_reset_dirty_range_locked(&env_tlb(env)->f[mmu_idx].table[i],
--
2.20.1

     ARITH_MVN = 0xf << 21,
 
+    INSN_B = 0x0a000000,
+
     INSN_CLZ = 0x016f0f10,
     INSN_RBIT = 0x06ff0f30,
 
@@ -XXX,XX +XXX,XX @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
 
 static void tcg_out_b_imm(TCGContext *s, ARMCond cond, int32_t offset)
 {
-    tcg_out32(s, (cond << 28) | 0x0a000000 |
+    tcg_out32(s, (cond << 28) | INSN_B |
               (((offset - 8) >> 2) & 0x00ffffff));
 }
 
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg)
 
 static void tcg_out_goto_tb(TCGContext *s, int which)
 {
-    /* Indirect jump method */
-    intptr_t ptr, dif, dil;
-    TCGReg base = TCG_REG_PC;
+    uintptr_t i_addr;
+    intptr_t i_disp;
 
-    ptr = get_jmp_target_addr(s, which);
-    dif = tcg_pcrel_diff(s, (void *)ptr) - 8;
-    dil = sextract32(dif, 0, 12);
-    if (dif != dil) {
+    /* Direct branch will be patched by tb_target_set_jmp_target. */
+    set_jmp_insn_offset(s, which);
+    tcg_out32(s, INSN_NOP);
+
+    /* When branch is out of range, fall through to indirect. */
+    i_addr = get_jmp_target_addr(s, which);
+    i_disp = tcg_pcrel_diff(s, (void *)i_addr) - 8;
+    tcg_debug_assert(i_disp < 0);
+    if (i_disp >= -0xfff) {
+        tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_PC, i_disp);
+    } else {
         /*
          * The TB is close, but outside the 12 bits addressable by
          * the load.  We can extend this to 20 bits with a sub of a
-         * shifted immediate from pc.  In the vastly unlikely event
-         * the code requires more than 1MB, we'll use 2 insns and
-         * be no worse off.
+         * shifted immediate from pc.
          */
-        base = TCG_REG_R0;
-        tcg_out_movi32(s, COND_AL, base, ptr - dil);
+        int h = -i_disp;
+        int l = h & 0xfff;
+
+        h = encode_imm_nofail(h - l);
+        tcg_out_dat_imm(s, COND_AL, ARITH_SUB, TCG_REG_R0, TCG_REG_PC, h);
+        tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_R0, l);
     }
-    tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, base, dil);
     set_jmp_reset_offset(s, which);
 }
 
 void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
                               uintptr_t jmp_rx, uintptr_t jmp_rw)
 {
-    /* Always indirect, nothing to do */
+    uintptr_t addr = tb->jmp_target_addr[n];
+    ptrdiff_t offset = addr - (jmp_rx + 8);
+    tcg_insn_unit insn;
+
+    /* Either directly branch, or fall through to indirect branch. */
+    if (offset == sextract64(offset, 0, 26)) {
+        /* B <addr> */
+        insn = deposit32((COND_AL << 28) | INSN_B, 0, 24, offset >> 2);
+    } else {
+        insn = INSN_NOP;
+    }
+
+    qatomic_set((uint32_t *)jmp_rw, insn);
+    flush_idcache_range(jmp_rx, jmp_rw, 4);
 }
 
 static void tcg_out_op(TCGContext *s, TCGOpcode opc,
--
2.34.1
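
A side note on the fallback path in the tcg/arm hunk above: the displacement to the jump-target slot is split into a 12-bit low part, consumed by the load, and a 0x1000-aligned high part, consumed by the sub from pc. While the distance stays under 1MB that high part occupies only bits 12..19, so it always fits ARM's rotated 8-bit immediate form, which is what encode_imm_nofail requires. A standalone sketch of that split; arm_imm_encodable is invented here for illustration and is not a QEMU helper:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * True if v fits ARM's modified-immediate form: an 8-bit value
 * rotated right by an even amount.
 */
static bool arm_imm_encodable(uint32_t v)
{
    for (int rot = 0; rot < 32; rot += 2) {
        uint32_t r = (v << rot) | (v >> ((32 - rot) & 31)); /* rotl32 */
        if (r <= 0xff) {
            return true;
        }
    }
    return false;
}

int main(void)
{
    /* Example: the jump-target slot sits 0x12345 bytes behind pc. */
    int i_disp = -0x12345;
    int h = -i_disp;        /* distance back to the slot; small in practice */
    int l = h & 0xfff;      /* low 12 bits, handled by the load             */
    int hi = h - l;         /* bits 12 and up; encodable while h < 1 MiB    */

    assert(arm_imm_encodable((uint32_t)hi)); /* single sub from pc suffices */
    assert(hi + l == h);                     /* the two parts recombine     */
    printf("sub part: %#x, load part: %#x\n", (unsigned)hi, (unsigned)l);
    return 0;
}

Compiling and running this only demonstrates the immediate-splitting property; the actual instruction emission is what the patch does with tcg_out_dat_imm and tcg_out_ld32_12.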
diff view generated by jsdifflib
The accel_list and tmp variables are only used when manufacturing
-machine accel, options based on -accel.

Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed by: Aleksandar Markovic <amarkovic@wavecomp.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 vl.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vl.c b/vl.c
index XXXXXXX..XXXXXXX 100644
--- a/vl.c
+++ b/vl.c
@@ -XXX,XX +XXX,XX @@ static int do_configure_accelerator(void *opaque, QemuOpts *opts, Error **errp)
 static void configure_accelerators(const char *progname)
 {
     const char *accel;
-    char **accel_list, **tmp;
     bool init_failed = false;
 
     qemu_opts_foreach(qemu_find_opts("icount"),
@@ -XXX,XX +XXX,XX @@ static void configure_accelerators(const char *progname)
 
     accel = qemu_opt_get(qemu_get_machine_opts(), "accel");
     if (QTAILQ_EMPTY(&qemu_accel_opts.head)) {
+        char **accel_list, **tmp;
+
         if (accel == NULL) {
             /* Select the default accelerator */
             if (!accel_find("tcg") && !accel_find("kvm")) {
--
2.20.1

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/riscv/tcg-target.c.inc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/riscv/tcg-target.c.inc
+++ b/tcg/riscv/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ typedef enum {
 #endif
 
     OPC_FENCE = 0x0000000f,
+    OPC_NOP = OPC_ADDI, /* nop = addi r0,r0,0 */
 } RISCVInsn;
 
 /*
@@ -XXX,XX +XXX,XX @@ static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
 {
     int i;
     for (i = 0; i < count; ++i) {
-        p[i] = encode_i(OPC_ADDI, TCG_REG_ZERO, TCG_REG_ZERO, 0);
+        p[i] = OPC_NOP;
     }
 }
--
2.34.1
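
The OPC_NOP alias works because the canonical RISC-V nop is addi x0, x0, 0, and with every other I-type field zero the instruction word is just the bare ADDI opcode, 0x00000013. A quick standalone check, using a from-scratch encoder written for illustration rather than the encode_i helper in tcg-target.c.inc:

#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* RISC-V I-type layout: imm[11:0] | rs1 | funct3 | rd | opcode */
static uint32_t rv_encode_i(uint32_t opcode, uint32_t funct3,
                            uint32_t rd, uint32_t rs1, int32_t imm)
{
    return (((uint32_t)imm & 0xfff) << 20) | (rs1 << 15)
           | (funct3 << 12) | (rd << 7) | opcode;
}

int main(void)
{
    /* addi x0, x0, 0: OP-IMM opcode 0x13, funct3 0 (ADDI), all fields zero. */
    uint32_t nop = rv_encode_i(0x13, 0, 0, 0, 0);

    assert(nop == 0x00000013);      /* the canonical nop encoding */
    printf("nop encodes to 0x%08" PRIx32 "\n", nop);
    return 0;
}

This is why the enum value can double as a ready-made instruction word in tcg_out_nop_fill above.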
diff view generated by jsdifflib
No functional change, but the smaller expressions make
the code easier to read.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/cputlb.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@ static void tlb_mmu_resize_locked(CPUTLBDesc *desc, CPUTLBDescFast *fast)
 
 static void tlb_flush_one_mmuidx_locked(CPUArchState *env, int mmu_idx)
 {
-    tlb_mmu_resize_locked(&env_tlb(env)->d[mmu_idx], &env_tlb(env)->f[mmu_idx]);
-    env_tlb(env)->d[mmu_idx].n_used_entries = 0;
-    env_tlb(env)->d[mmu_idx].large_page_addr = -1;
-    env_tlb(env)->d[mmu_idx].large_page_mask = -1;
-    env_tlb(env)->d[mmu_idx].vindex = 0;
-    memset(env_tlb(env)->f[mmu_idx].table, -1,
-           sizeof_tlb(&env_tlb(env)->f[mmu_idx]));
-    memset(env_tlb(env)->d[mmu_idx].vtable, -1,
-           sizeof(env_tlb(env)->d[0].vtable));
+    CPUTLBDesc *desc = &env_tlb(env)->d[mmu_idx];
+    CPUTLBDescFast *fast = &env_tlb(env)->f[mmu_idx];
+
+    tlb_mmu_resize_locked(desc, fast);
+    desc->n_used_entries = 0;
+    desc->large_page_addr = -1;
+    desc->large_page_mask = -1;
+    desc->vindex = 0;
+    memset(fast->table, -1, sizeof_tlb(fast));
+    memset(desc->vtable, -1, sizeof(desc->vtable));
 }
 
 static inline void tlb_n_used_entries_inc(CPUArchState *env, uintptr_t mmu_idx)
--
2.20.1

Now that tcg can handle direct and indirect goto_tb simultaneously,
we can optimistically leave space for a direct branch and fall back
to loading the pointer from the TB for an indirect branch.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Alistair Francis <alistair.francis@wdc.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/riscv/tcg-target.c.inc | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/riscv/tcg-target.c.inc
+++ b/tcg/riscv/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
 
 static void tcg_out_goto_tb(TCGContext *s, int which)
 {
-    /* indirect jump method */
+    /* Direct branch will be patched by tb_target_set_jmp_target. */
+    set_jmp_insn_offset(s, which);
+    tcg_out32(s, OPC_JAL);
+
+    /* When branch is out of range, fall through to indirect. */
     tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, TCG_REG_ZERO,
                get_jmp_target_addr(s, which));
     tcg_out_opc_imm(s, OPC_JALR, TCG_REG_ZERO, TCG_REG_TMP0, 0);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_goto_tb(TCGContext *s, int which)
 void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
                               uintptr_t jmp_rx, uintptr_t jmp_rw)
 {
-    /* Always indirect, nothing to do */
+    uintptr_t addr = tb->jmp_target_addr[n];
+    ptrdiff_t offset = addr - jmp_rx;
+    tcg_insn_unit insn;
+
+    /* Either directly branch, or fall through to indirect branch. */
+    if (offset == sextreg(offset, 0, 20)) {
+        insn = encode_uj(OPC_JAL, TCG_REG_ZERO, offset);
+    } else {
+        insn = OPC_NOP;
+    }
+    qatomic_set((uint32_t *)jmp_rw, insn);
+    flush_idcache_range(jmp_rx, jmp_rw, 4);
 }
 
 static void tcg_out_op(TCGContext *s, TCGOpcode opc,
--
2.34.1
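
This is the same optimistic scheme as the tcg/arm patch earlier in this mail: code generation always reserves a patchable slot in front of the indirect load+jump, and tb_target_set_jmp_target later decides, per destination, whether that slot holds a real branch or a nop. A minimal standalone sketch of that decision; sext and choose_slot are illustrative stand-ins rather than QEMU functions, and the 20-bit figure mirrors the sextreg check in the hunk above:

#include <stdint.h>
#include <stdio.h>

/* Sign-extend the low "len" bits of x, in the spirit of sextreg(). */
static int64_t sext(int64_t x, unsigned len)
{
    return (int64_t)((uint64_t)x << (64 - len)) >> (64 - len);
}

/*
 * The per-destination decision made when a jump target is (re)set:
 * use the reserved slot for a direct branch when the displacement
 * fits, otherwise leave a nop so execution falls through to the
 * indirect jump emitted right after the slot.
 */
static const char *choose_slot(uint64_t slot_rx, uint64_t target,
                               unsigned direct_bits)
{
    int64_t disp = (int64_t)(target - slot_rx);
    return disp == sext(disp, direct_bits) ? "direct branch" : "nop (indirect)";
}

int main(void)
{
    uint64_t slot = 0x7f0000100000ull;

    printf("near TB: %s\n", choose_slot(slot, slot + 0x40000, 20));
    printf("far TB:  %s\n", choose_slot(slot, slot + 0x4000000, 20));
    return 0;
}

On a real target the chosen instruction word is then written with a single atomic store and the instruction cache flushed, which is what the hunks above do with qatomic_set and flush_idcache_range.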
diff view generated by jsdifflib