Second pull for this week, since this set is large enough by itself.

r~

The following changes since commit 7c9236d6d61f30583d5d860097d88dbf0fe487bf:

  Merge tag 'pull-tcg-20230116' of https://gitlab.com/rth7680/qemu into staging (2023-01-17 10:24:16 +0000)

are available in the Git repository at:

  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20230117

for you to fetch changes up to 493c9b19a7fb7f387c4fcf57d3836504d5242bf5:

  tcg/riscv: Implement direct branch for goto_tb (2023-01-17 22:36:17 +0000)

----------------------------------------------------------------
tcg: Fix race conditions in (most) goto_tb implementations

----------------------------------------------------------------
Richard Henderson (22):
      tcg: Split out tcg_out_exit_tb
      tcg/i386: Remove unused goto_tb code for indirect jump
      tcg/ppc: Remove unused goto_tb code for indirect jump
      tcg/sparc64: Remove unused goto_tb code for indirect jump
      tcg: Replace asserts on tcg_jmp_insn_offset
      tcg: Introduce set_jmp_insn_offset
      tcg: Introduce get_jmp_target_addr
      tcg: Split out tcg_out_goto_tb
      tcg: Rename TB_JMP_RESET_OFFSET_INVALID to TB_JMP_OFFSET_INVALID
      tcg: Add gen_tb to TCGContext
      tcg: Add TranslationBlock.jmp_insn_offset
      tcg: Change tb_target_set_jmp_target arguments
      tcg: Move tb_target_set_jmp_target declaration to tcg.h
      tcg: Always define tb_target_set_jmp_target
      tcg: Remove TCG_TARGET_HAS_direct_jump
      tcg/aarch64: Reorg goto_tb implementation
      tcg/ppc: Reorg goto_tb implementation
      tcg/sparc64: Remove USE_REG_TB
      tcg/sparc64: Reorg goto_tb implementation
      tcg/arm: Implement direct branch for goto_tb
      tcg/riscv: Introduce OPC_NOP
      tcg/riscv: Implement direct branch for goto_tb

 include/exec/exec-all.h          |   5 +-
 include/tcg/tcg.h                |  14 ++-
 tcg/aarch64/tcg-target.h         |   6 +-
 tcg/arm/tcg-target.h             |   5 -
 tcg/i386/tcg-target.h            |   9 --
 tcg/loongarch64/tcg-target.h     |   3 -
 tcg/mips/tcg-target.h            |   5 -
 tcg/ppc/tcg-target.h             |   7 +-
 tcg/riscv/tcg-target.h           |   4 -
 tcg/s390x/tcg-target.h           |  11 ---
 tcg/sparc64/tcg-target.h         |   4 -
 tcg/tci/tcg-target.h             |   4 -
 accel/tcg/cpu-exec.c             |  21 ++--
 accel/tcg/translate-all.c        |  10 +-
 tcg/tcg-op.c                     |  14 +--
 tcg/tcg.c                        |  42 +++++---
 tcg/aarch64/tcg-target.c.inc     | 106 ++++++++++-----------
 tcg/arm/tcg-target.c.inc         |  89 +++++++++++------
 tcg/i386/tcg-target.c.inc        |  68 +++++++------
 tcg/loongarch64/tcg-target.c.inc |  66 +++++++------
 tcg/mips/tcg-target.c.inc        |  59 +++++++-----
 tcg/ppc/tcg-target.c.inc         | 193 ++++++++++++-------------------
 tcg/riscv/tcg-target.c.inc       |  65 +++++++----
 tcg/s390x/tcg-target.c.inc       |  67 ++++++-----
 tcg/sparc64/tcg-target.c.inc     | 201 +++++++++++++++------------------
 tcg/tci/tcg-target.c.inc         |  31 +++---
 26 files changed, 528 insertions(+), 581 deletions(-)
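The headline item is the goto_tb race: a translation block chains to its successor through a jump instruction that is patched in place while other vCPU threads may already be executing the surrounding code. The sketch below is illustrative only — it is not code from this series, and patch_branch_slot() is a hypothetical stand-in for the per-backend tb_target_set_jmp_target() implementations these patches rework — but it shows the constraint the backend reorganisations enforce: the patch must be one aligned atomic store to the instruction word.

```c
/*
 * Illustrative sketch only -- not part of this series.  The branch slot
 * inside already-generated (and possibly executing) code may only be
 * changed with a single aligned atomic store, never a read-modify-write
 * or two partial writes, or another vCPU thread can fetch a torn
 * instruction.  patch_branch_slot() is a hypothetical helper name.
 */
#include <stdint.h>
#include <assert.h>

static void patch_branch_slot(uint32_t *slot, uint32_t new_insn)
{
    assert(((uintptr_t)slot & 3) == 0);                  /* naturally aligned */
    __atomic_store_n(slot, new_insn, __ATOMIC_RELEASE);  /* one atomic store */
    /* A real backend would also flush the instruction cache here. */
}
```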
The INDEX_op_exit_tb opcode needs no register allocation.
Split out a dedicated helper function for it.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg.c                        |  4 ++++
 tcg/aarch64/tcg-target.c.inc     | 22 ++++++++++--------
 tcg/arm/tcg-target.c.inc         | 11 +++++----
 tcg/i386/tcg-target.c.inc        | 21 +++++++++--------
 tcg/loongarch64/tcg-target.c.inc | 22 ++++++++++--------
 tcg/mips/tcg-target.c.inc        | 33 +++++++++++++--------------
 tcg/ppc/tcg-target.c.inc         | 11 +++++----
 tcg/riscv/tcg-target.c.inc       | 22 ++++++++++--------
 tcg/s390x/tcg-target.c.inc       | 23 ++++++++++---------
 tcg/sparc64/tcg-target.c.inc     | 39 +++++++++++++++++---------------
 tcg/tci/tcg-target.c.inc         | 10 ++++----
 11 files changed, 121 insertions(+), 97 deletions(-)
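The per-backend hunks below all follow the same shape. Condensed from the diff in this patch (excerpted and reformatted, not standalone code), the common dispatcher in tcg/tcg.c hands INDEX_op_exit_tb to a new backend hook, shown here with the aarch64 body as the example:

```c
/* Excerpted from the hunks below -- not standalone code. */

/* tcg/tcg.c: dispatch the opcode to the new per-backend helper. */
    case INDEX_op_exit_tb:
        tcg_out_exit_tb(s, op->args[0]);
        break;

/* tcg/aarch64/tcg-target.c.inc: the helper, moved out of tcg_out_op(). */
static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
{
    /* Reuse the zeroing that exists for goto_ptr. */
    if (a0 == 0) {
        tcg_out_goto_long(s, tcg_code_gen_epilogue);
    } else {
        tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_X0, a0);
        tcg_out_goto_long(s, tb_ret_addr);
    }
}
```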
diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
21
diff --git a/tcg/tcg.c b/tcg/tcg.c
18
index XXXXXXX..XXXXXXX 100644
22
index XXXXXXX..XXXXXXX 100644
19
--- a/include/exec/cpu-defs.h
23
--- a/tcg/tcg.c
20
+++ b/include/exec/cpu-defs.h
24
+++ b/tcg/tcg.c
21
@@ -XXX,XX +XXX,XX @@ typedef uint64_t target_ulong;
25
@@ -XXX,XX +XXX,XX @@ static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg1,
22
# endif
26
static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg);
23
# endif
27
static void tcg_out_movi(TCGContext *s, TCGType type,
24
28
TCGReg ret, tcg_target_long arg);
25
+/* Minimalized TLB entry for use by TCG fast path. */
29
+static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg);
26
typedef struct CPUTLBEntry {
30
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
27
/* bit TARGET_LONG_BITS to TARGET_PAGE_BITS : virtual address
31
const TCGArg args[TCG_MAX_OP_ARGS],
28
bit TARGET_PAGE_BITS-1..4 : Nonzero for accesses that should not
32
const int const_args[TCG_MAX_OP_ARGS]);
29
@@ -XXX,XX +XXX,XX @@ typedef struct CPUTLBEntry {
33
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, target_ulong pc_start)
30
34
case INDEX_op_call:
31
QEMU_BUILD_BUG_ON(sizeof(CPUTLBEntry) != (1 << CPU_TLB_ENTRY_BITS));
35
tcg_reg_alloc_call(s, op);
32
36
break;
33
-/* The IOTLB is not accessed directly inline by generated TCG code,
37
+ case INDEX_op_exit_tb:
34
- * so the CPUIOTLBEntry layout is not as critical as that of the
38
+ tcg_out_exit_tb(s, op->args[0]);
35
- * CPUTLBEntry. (This is also why we don't want to combine the two
39
+ break;
36
- * structs into one.)
40
case INDEX_op_dup2_vec:
37
+/*
41
if (tcg_reg_alloc_dup2(s, op)) {
38
+ * The full TLB entry, which is not accessed by generated TCG code,
42
break;
39
+ * so the layout is not as critical as that of CPUTLBEntry. This is
43
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
40
+ * also why we don't want to combine the two structs.
44
index XXXXXXX..XXXXXXX 100644
41
*/
45
--- a/tcg/aarch64/tcg-target.c.inc
42
-typedef struct CPUIOTLBEntry {
46
+++ b/tcg/aarch64/tcg-target.c.inc
43
+typedef struct CPUTLBEntryFull {
47
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
44
/*
48
45
- * @addr contains:
49
static const tcg_insn_unit *tb_ret_addr;
46
+ * @xlat_section contains:
50
47
* - in the lower TARGET_PAGE_BITS, a physical section number
51
+static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
48
* - with the lower TARGET_PAGE_BITS masked off, an offset which
52
+{
49
* must be added to the virtual address to obtain:
53
+ /* Reuse the zeroing that exists for goto_ptr. */
50
@@ -XXX,XX +XXX,XX @@ typedef struct CPUIOTLBEntry {
54
+ if (a0 == 0) {
51
* number is PHYS_SECTION_NOTDIRTY or PHYS_SECTION_ROM)
55
+ tcg_out_goto_long(s, tcg_code_gen_epilogue);
52
* + the offset within the target MemoryRegion (otherwise)
56
+ } else {
53
*/
57
+ tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_X0, a0);
54
- hwaddr addr;
58
+ tcg_out_goto_long(s, tb_ret_addr);
55
+ hwaddr xlat_section;
59
+ }
56
MemTxAttrs attrs;
60
+}
57
-} CPUIOTLBEntry;
61
+
58
+} CPUTLBEntryFull;
62
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
59
63
const TCGArg args[TCG_MAX_OP_ARGS],
60
/*
64
const int const_args[TCG_MAX_OP_ARGS])
61
* Data elements that are per MMU mode, minus the bits accessed by
65
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
62
@@ -XXX,XX +XXX,XX @@ typedef struct CPUTLBDesc {
66
#define REG0(I) (const_args[I] ? TCG_REG_XZR : (TCGReg)args[I])
63
size_t vindex;
67
64
/* The tlb victim table, in two parts. */
68
switch (opc) {
65
CPUTLBEntry vtable[CPU_VTLB_SIZE];
69
- case INDEX_op_exit_tb:
66
- CPUIOTLBEntry viotlb[CPU_VTLB_SIZE];
70
- /* Reuse the zeroing that exists for goto_ptr. */
67
- /* The iotlb. */
71
- if (a0 == 0) {
68
- CPUIOTLBEntry *iotlb;
72
- tcg_out_goto_long(s, tcg_code_gen_epilogue);
69
+ CPUTLBEntryFull vfulltlb[CPU_VTLB_SIZE];
73
- } else {
70
+ CPUTLBEntryFull *fulltlb;
74
- tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_X0, a0);
71
} CPUTLBDesc;
75
- tcg_out_goto_long(s, tb_ret_addr);
72
76
- }
73
/*
77
- break;
74
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
78
-
75
index XXXXXXX..XXXXXXX 100644
79
case INDEX_op_goto_tb:
76
--- a/accel/tcg/cputlb.c
80
tcg_debug_assert(s->tb_jmp_insn_offset != NULL);
77
+++ b/accel/tcg/cputlb.c
81
/*
78
@@ -XXX,XX +XXX,XX @@ static void tlb_mmu_resize_locked(CPUTLBDesc *desc, CPUTLBDescFast *fast,
82
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
79
}
83
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
80
84
case INDEX_op_mov_i64:
81
g_free(fast->table);
85
case INDEX_op_call: /* Always emitted via tcg_out_call. */
82
- g_free(desc->iotlb);
86
+ case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
83
+ g_free(desc->fulltlb);
87
default:
84
88
g_assert_not_reached();
85
tlb_window_reset(desc, now, 0);
89
}
86
/* desc->n_used_entries is cleared by the caller */
90
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
87
fast->mask = (new_size - 1) << CPU_TLB_ENTRY_BITS;
91
index XXXXXXX..XXXXXXX 100644
88
fast->table = g_try_new(CPUTLBEntry, new_size);
92
--- a/tcg/arm/tcg-target.c.inc
89
- desc->iotlb = g_try_new(CPUIOTLBEntry, new_size);
93
+++ b/tcg/arm/tcg-target.c.inc
90
+ desc->fulltlb = g_try_new(CPUTLBEntryFull, new_size);
94
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
91
95
92
/*
96
static void tcg_out_epilogue(TCGContext *s);
93
* If the allocations fail, try smaller sizes. We just freed some
97
94
@@ -XXX,XX +XXX,XX @@ static void tlb_mmu_resize_locked(CPUTLBDesc *desc, CPUTLBDescFast *fast,
98
+static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg)
95
* allocations to fail though, so we progressively reduce the allocation
99
+{
96
* size, aborting if we cannot even allocate the smallest TLB we support.
100
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R0, arg);
97
*/
101
+ tcg_out_epilogue(s);
98
- while (fast->table == NULL || desc->iotlb == NULL) {
102
+}
99
+ while (fast->table == NULL || desc->fulltlb == NULL) {
103
+
100
if (new_size == (1 << CPU_TLB_DYN_MIN_BITS)) {
104
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
101
error_report("%s: %s", __func__, strerror(errno));
105
const TCGArg args[TCG_MAX_OP_ARGS],
102
abort();
106
const int const_args[TCG_MAX_OP_ARGS])
103
@@ -XXX,XX +XXX,XX @@ static void tlb_mmu_resize_locked(CPUTLBDesc *desc, CPUTLBDescFast *fast,
107
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
104
fast->mask = (new_size - 1) << CPU_TLB_ENTRY_BITS;
108
int c;
105
109
106
g_free(fast->table);
110
switch (opc) {
107
- g_free(desc->iotlb);
111
- case INDEX_op_exit_tb:
108
+ g_free(desc->fulltlb);
112
- tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R0, args[0]);
109
fast->table = g_try_new(CPUTLBEntry, new_size);
113
- tcg_out_epilogue(s);
110
- desc->iotlb = g_try_new(CPUIOTLBEntry, new_size);
114
- break;
111
+ desc->fulltlb = g_try_new(CPUTLBEntryFull, new_size);
115
case INDEX_op_goto_tb:
112
}
116
{
113
}
117
/* Indirect jump method */
114
118
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
115
@@ -XXX,XX +XXX,XX @@ static void tlb_mmu_init(CPUTLBDesc *desc, CPUTLBDescFast *fast, int64_t now)
119
116
desc->n_used_entries = 0;
120
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
117
fast->mask = (n_entries - 1) << CPU_TLB_ENTRY_BITS;
121
case INDEX_op_call: /* Always emitted via tcg_out_call. */
118
fast->table = g_new(CPUTLBEntry, n_entries);
122
+ case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
119
- desc->iotlb = g_new(CPUIOTLBEntry, n_entries);
123
default:
120
+ desc->fulltlb = g_new(CPUTLBEntryFull, n_entries);
124
tcg_abort();
121
tlb_mmu_flush_locked(desc, fast);
125
}
122
}
126
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
123
127
index XXXXXXX..XXXXXXX 100644
124
@@ -XXX,XX +XXX,XX @@ void tlb_destroy(CPUState *cpu)
128
--- a/tcg/i386/tcg-target.c.inc
125
CPUTLBDescFast *fast = &env_tlb(env)->f[i];
129
+++ b/tcg/i386/tcg-target.c.inc
126
130
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
127
g_free(fast->table);
128
- g_free(desc->iotlb);
129
+ g_free(desc->fulltlb);
130
}
131
}
132
133
@@ -XXX,XX +XXX,XX @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
134
135
/* Evict the old entry into the victim tlb. */
136
copy_tlb_helper_locked(tv, te);
137
- desc->viotlb[vidx] = desc->iotlb[index];
138
+ desc->vfulltlb[vidx] = desc->fulltlb[index];
139
tlb_n_used_entries_dec(env, mmu_idx);
140
}
141
142
@@ -XXX,XX +XXX,XX @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
143
* subtract here is that of the page base, and not the same as the
144
* vaddr we add back in io_readx()/io_writex()/get_page_addr_code().
145
*/
146
- desc->iotlb[index].addr = iotlb - vaddr_page;
147
- desc->iotlb[index].attrs = attrs;
148
+ desc->fulltlb[index].xlat_section = iotlb - vaddr_page;
149
+ desc->fulltlb[index].attrs = attrs;
150
151
/* Now calculate the new entry */
152
tn.addend = addend - vaddr_page;
153
@@ -XXX,XX +XXX,XX @@ static inline void cpu_transaction_failed(CPUState *cpu, hwaddr physaddr,
154
}
155
}
156
157
-static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
158
+static uint64_t io_readx(CPUArchState *env, CPUTLBEntryFull *full,
159
int mmu_idx, target_ulong addr, uintptr_t retaddr,
160
MMUAccessType access_type, MemOp op)
161
{
162
@@ -XXX,XX +XXX,XX @@ static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
163
bool locked = false;
164
MemTxResult r;
165
166
- section = iotlb_to_section(cpu, iotlbentry->addr, iotlbentry->attrs);
167
+ section = iotlb_to_section(cpu, full->xlat_section, full->attrs);
168
mr = section->mr;
169
- mr_offset = (iotlbentry->addr & TARGET_PAGE_MASK) + addr;
170
+ mr_offset = (full->xlat_section & TARGET_PAGE_MASK) + addr;
171
cpu->mem_io_pc = retaddr;
172
if (!cpu->can_do_io) {
173
cpu_io_recompile(cpu, retaddr);
174
@@ -XXX,XX +XXX,XX @@ static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
175
qemu_mutex_lock_iothread();
176
locked = true;
177
}
178
- r = memory_region_dispatch_read(mr, mr_offset, &val, op, iotlbentry->attrs);
179
+ r = memory_region_dispatch_read(mr, mr_offset, &val, op, full->attrs);
180
if (r != MEMTX_OK) {
181
hwaddr physaddr = mr_offset +
182
section->offset_within_address_space -
183
section->offset_within_region;
184
185
cpu_transaction_failed(cpu, physaddr, addr, memop_size(op), access_type,
186
- mmu_idx, iotlbentry->attrs, r, retaddr);
187
+ mmu_idx, full->attrs, r, retaddr);
188
}
189
if (locked) {
190
qemu_mutex_unlock_iothread();
191
@@ -XXX,XX +XXX,XX @@ static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
192
}
193
194
/*
195
- * Save a potentially trashed IOTLB entry for later lookup by plugin.
196
- * This is read by tlb_plugin_lookup if the iotlb entry doesn't match
197
+ * Save a potentially trashed CPUTLBEntryFull for later lookup by plugin.
198
+ * This is read by tlb_plugin_lookup if the fulltlb entry doesn't match
199
* because of the side effect of io_writex changing memory layout.
200
*/
201
static void save_iotlb_data(CPUState *cs, hwaddr addr,
202
@@ -XXX,XX +XXX,XX @@ static void save_iotlb_data(CPUState *cs, hwaddr addr,
203
#endif
131
#endif
204
}
132
}
205
133
206
-static void io_writex(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
134
+static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
207
+static void io_writex(CPUArchState *env, CPUTLBEntryFull *full,
135
+{
208
int mmu_idx, uint64_t val, target_ulong addr,
136
+ /* Reuse the zeroing that exists for goto_ptr. */
209
uintptr_t retaddr, MemOp op)
137
+ if (a0 == 0) {
210
{
138
+ tcg_out_jmp(s, tcg_code_gen_epilogue);
211
@@ -XXX,XX +XXX,XX @@ static void io_writex(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
139
+ } else {
212
bool locked = false;
140
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
213
MemTxResult r;
141
+ tcg_out_jmp(s, tb_ret_addr);
214
142
+ }
215
- section = iotlb_to_section(cpu, iotlbentry->addr, iotlbentry->attrs);
143
+}
216
+ section = iotlb_to_section(cpu, full->xlat_section, full->attrs);
144
+
217
mr = section->mr;
145
static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
218
- mr_offset = (iotlbentry->addr & TARGET_PAGE_MASK) + addr;
146
const TCGArg args[TCG_MAX_OP_ARGS],
219
+ mr_offset = (full->xlat_section & TARGET_PAGE_MASK) + addr;
147
const int const_args[TCG_MAX_OP_ARGS])
220
if (!cpu->can_do_io) {
148
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
221
cpu_io_recompile(cpu, retaddr);
149
const_a2 = const_args[2];
222
}
150
223
@@ -XXX,XX +XXX,XX @@ static void io_writex(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
151
switch (opc) {
224
* The memory_region_dispatch may trigger a flush/resize
152
- case INDEX_op_exit_tb:
225
* so for plugins we save the iotlb_data just in case.
153
- /* Reuse the zeroing that exists for goto_ptr. */
226
*/
154
- if (a0 == 0) {
227
- save_iotlb_data(cpu, iotlbentry->addr, section, mr_offset);
155
- tcg_out_jmp(s, tcg_code_gen_epilogue);
228
+ save_iotlb_data(cpu, full->xlat_section, section, mr_offset);
156
- } else {
229
157
- tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
230
if (!qemu_mutex_iothread_locked()) {
158
- tcg_out_jmp(s, tb_ret_addr);
231
qemu_mutex_lock_iothread();
159
- }
232
locked = true;
160
- break;
233
}
161
case INDEX_op_goto_tb:
234
- r = memory_region_dispatch_write(mr, mr_offset, val, op, iotlbentry->attrs);
162
if (s->tb_jmp_insn_offset) {
235
+ r = memory_region_dispatch_write(mr, mr_offset, val, op, full->attrs);
163
/* direct jump method */
236
if (r != MEMTX_OK) {
164
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
237
hwaddr physaddr = mr_offset +
165
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
238
section->offset_within_address_space -
166
case INDEX_op_mov_i64:
239
section->offset_within_region;
167
case INDEX_op_call: /* Always emitted via tcg_out_call. */
240
168
+ case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
241
cpu_transaction_failed(cpu, physaddr, addr, memop_size(op),
169
default:
242
- MMU_DATA_STORE, mmu_idx, iotlbentry->attrs, r,
170
tcg_abort();
243
+ MMU_DATA_STORE, mmu_idx, full->attrs, r,
171
}
244
retaddr);
172
diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
245
}
173
index XXXXXXX..XXXXXXX 100644
246
if (locked) {
174
--- a/tcg/loongarch64/tcg-target.c.inc
247
@@ -XXX,XX +XXX,XX @@ static bool victim_tlb_hit(CPUArchState *env, size_t mmu_idx, size_t index,
175
+++ b/tcg/loongarch64/tcg-target.c.inc
248
copy_tlb_helper_locked(vtlb, &tmptlb);
176
@@ -XXX,XX +XXX,XX @@ void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
249
qemu_spin_unlock(&env_tlb(env)->c.lock);
177
250
178
static const tcg_insn_unit *tb_ret_addr;
251
- CPUIOTLBEntry tmpio, *io = &env_tlb(env)->d[mmu_idx].iotlb[index];
179
252
- CPUIOTLBEntry *vio = &env_tlb(env)->d[mmu_idx].viotlb[vidx];
180
+static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
253
- tmpio = *io; *io = *vio; *vio = tmpio;
181
+{
254
+ CPUTLBEntryFull *f1 = &env_tlb(env)->d[mmu_idx].fulltlb[index];
182
+ /* Reuse the zeroing that exists for goto_ptr. */
255
+ CPUTLBEntryFull *f2 = &env_tlb(env)->d[mmu_idx].vfulltlb[vidx];
183
+ if (a0 == 0) {
256
+ CPUTLBEntryFull tmpf;
184
+ tcg_out_call_int(s, tcg_code_gen_epilogue, true);
257
+ tmpf = *f1; *f1 = *f2; *f2 = tmpf;
185
+ } else {
258
return true;
186
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_A0, a0);
259
}
187
+ tcg_out_call_int(s, tb_ret_addr, true);
260
}
188
+ }
261
@@ -XXX,XX +XXX,XX @@ static bool victim_tlb_hit(CPUArchState *env, size_t mmu_idx, size_t index,
189
+}
262
(ADDR) & TARGET_PAGE_MASK)
190
+
263
191
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
264
static void notdirty_write(CPUState *cpu, vaddr mem_vaddr, unsigned size,
192
const TCGArg args[TCG_MAX_OP_ARGS],
265
- CPUIOTLBEntry *iotlbentry, uintptr_t retaddr)
193
const int const_args[TCG_MAX_OP_ARGS])
266
+ CPUTLBEntryFull *full, uintptr_t retaddr)
194
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
267
{
195
int c2 = const_args[2];
268
- ram_addr_t ram_addr = mem_vaddr + iotlbentry->addr;
196
269
+ ram_addr_t ram_addr = mem_vaddr + full->xlat_section;
197
switch (opc) {
270
198
- case INDEX_op_exit_tb:
271
trace_memory_notdirty_write_access(mem_vaddr, ram_addr, size);
199
- /* Reuse the zeroing that exists for goto_ptr. */
272
200
- if (a0 == 0) {
273
@@ -XXX,XX +XXX,XX @@ int probe_access_flags(CPUArchState *env, target_ulong addr,
201
- tcg_out_call_int(s, tcg_code_gen_epilogue, true);
274
/* Handle clean RAM pages. */
202
- } else {
275
if (unlikely(flags & TLB_NOTDIRTY)) {
203
- tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_A0, a0);
276
uintptr_t index = tlb_index(env, mmu_idx, addr);
204
- tcg_out_call_int(s, tb_ret_addr, true);
277
- CPUIOTLBEntry *iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
205
- }
278
+ CPUTLBEntryFull *full = &env_tlb(env)->d[mmu_idx].fulltlb[index];
206
- break;
279
207
-
280
- notdirty_write(env_cpu(env), addr, 1, iotlbentry, retaddr);
208
case INDEX_op_goto_tb:
281
+ notdirty_write(env_cpu(env), addr, 1, full, retaddr);
209
tcg_debug_assert(s->tb_jmp_insn_offset != NULL);
282
flags &= ~TLB_NOTDIRTY;
210
/*
283
}
211
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
284
212
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
285
@@ -XXX,XX +XXX,XX @@ void *probe_access(CPUArchState *env, target_ulong addr, int size,
213
case INDEX_op_mov_i64:
286
214
case INDEX_op_call: /* Always emitted via tcg_out_call. */
287
if (unlikely(flags & (TLB_NOTDIRTY | TLB_WATCHPOINT))) {
215
+ case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
288
uintptr_t index = tlb_index(env, mmu_idx, addr);
216
default:
289
- CPUIOTLBEntry *iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
217
g_assert_not_reached();
290
+ CPUTLBEntryFull *full = &env_tlb(env)->d[mmu_idx].fulltlb[index];
218
}
291
219
diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
292
/* Handle watchpoints. */
220
index XXXXXXX..XXXXXXX 100644
293
if (flags & TLB_WATCHPOINT) {
221
--- a/tcg/mips/tcg-target.c.inc
294
int wp_access = (access_type == MMU_DATA_STORE
222
+++ b/tcg/mips/tcg-target.c.inc
295
? BP_MEM_WRITE : BP_MEM_READ);
223
@@ -XXX,XX +XXX,XX @@ static void tcg_out_clz(TCGContext *s, MIPSInsn opcv2, MIPSInsn opcv6,
296
cpu_check_watchpoint(env_cpu(env), addr, size,
224
}
297
- iotlbentry->attrs, wp_access, retaddr);
225
}
298
+ full->attrs, wp_access, retaddr);
226
299
}
227
+static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
300
228
+{
301
/* Handle clean RAM pages. */
229
+ TCGReg b0 = TCG_REG_ZERO;
302
if (flags & TLB_NOTDIRTY) {
230
+
303
- notdirty_write(env_cpu(env), addr, 1, iotlbentry, retaddr);
231
+ if (a0 & ~0xffff) {
304
+ notdirty_write(env_cpu(env), addr, 1, full, retaddr);
232
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_V0, a0 & ~0xffff);
305
}
233
+ b0 = TCG_REG_V0;
306
}
234
+ }
307
235
+ if (!tcg_out_opc_jmp(s, OPC_J, tb_ret_addr)) {
308
@@ -XXX,XX +XXX,XX @@ tb_page_addr_t get_page_addr_code_hostp(CPUArchState *env, target_ulong addr,
236
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_TMP0, (uintptr_t)tb_ret_addr);
309
* should have just filled the TLB. The one corner case is io_writex
237
+ tcg_out_opc_reg(s, OPC_JR, 0, TCG_TMP0, 0);
310
* which can cause TLB flushes and potential resizing of the TLBs
238
+ }
311
* losing the information we need. In those cases we need to recover
239
+ tcg_out_opc_imm(s, OPC_ORI, TCG_REG_V0, b0, a0 & 0xffff);
312
- * data from a copy of the iotlbentry. As long as this always occurs
240
+}
313
+ * data from a copy of the CPUTLBEntryFull. As long as this always occurs
241
+
314
* from the same thread (which a mem callback will be) this is safe.
242
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
315
*/
243
const TCGArg args[TCG_MAX_OP_ARGS],
316
244
const int const_args[TCG_MAX_OP_ARGS])
317
@@ -XXX,XX +XXX,XX @@ bool tlb_plugin_lookup(CPUState *cpu, target_ulong addr, int mmu_idx,
245
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
318
if (likely(tlb_hit(tlb_addr, addr))) {
246
c2 = const_args[2];
319
/* We must have an iotlb entry for MMIO */
247
320
if (tlb_addr & TLB_MMIO) {
248
switch (opc) {
321
- CPUIOTLBEntry *iotlbentry;
249
- case INDEX_op_exit_tb:
322
- iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
250
- {
323
+ CPUTLBEntryFull *full;
251
- TCGReg b0 = TCG_REG_ZERO;
324
+ full = &env_tlb(env)->d[mmu_idx].fulltlb[index];
252
-
325
data->is_io = true;
253
- a0 = (intptr_t)a0;
326
- data->v.io.section = iotlb_to_section(cpu, iotlbentry->addr, iotlbentry->attrs);
254
- if (a0 & ~0xffff) {
327
- data->v.io.offset = (iotlbentry->addr & TARGET_PAGE_MASK) + addr;
255
- tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_V0, a0 & ~0xffff);
328
+ data->v.io.section =
256
- b0 = TCG_REG_V0;
329
+ iotlb_to_section(cpu, full->xlat_section, full->attrs);
257
- }
330
+ data->v.io.offset = (full->xlat_section & TARGET_PAGE_MASK) + addr;
258
- if (!tcg_out_opc_jmp(s, OPC_J, tb_ret_addr)) {
331
} else {
259
- tcg_out_movi(s, TCG_TYPE_PTR, TCG_TMP0,
332
data->is_io = false;
260
- (uintptr_t)tb_ret_addr);
333
data->v.ram.hostaddr = (void *)((uintptr_t)addr + tlbe->addend);
261
- tcg_out_opc_reg(s, OPC_JR, 0, TCG_TMP0, 0);
334
@@ -XXX,XX +XXX,XX @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
262
- }
335
263
- tcg_out_opc_imm(s, OPC_ORI, TCG_REG_V0, b0, a0 & 0xffff);
336
if (unlikely(tlb_addr & TLB_NOTDIRTY)) {
264
- }
337
notdirty_write(env_cpu(env), addr, size,
265
- break;
338
- &env_tlb(env)->d[mmu_idx].iotlb[index], retaddr);
266
case INDEX_op_goto_tb:
339
+ &env_tlb(env)->d[mmu_idx].fulltlb[index], retaddr);
267
/* indirect jump method */
340
}
268
tcg_debug_assert(s->tb_jmp_insn_offset == 0);
341
269
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
342
return hostaddr;
270
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
343
@@ -XXX,XX +XXX,XX @@ load_helper(CPUArchState *env, target_ulong addr, MemOpIdx oi,
271
case INDEX_op_mov_i64:
344
272
case INDEX_op_call: /* Always emitted via tcg_out_call. */
345
/* Handle anything that isn't just a straight memory access. */
273
+ case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
346
if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) {
274
default:
347
- CPUIOTLBEntry *iotlbentry;
275
tcg_abort();
348
+ CPUTLBEntryFull *full;
276
}
349
bool need_swap;
277
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
350
278
index XXXXXXX..XXXXXXX 100644
351
/* For anything that is unaligned, recurse through full_load. */
279
--- a/tcg/ppc/tcg-target.c.inc
352
@@ -XXX,XX +XXX,XX @@ load_helper(CPUArchState *env, target_ulong addr, MemOpIdx oi,
280
+++ b/tcg/ppc/tcg-target.c.inc
353
goto do_unaligned_access;
281
@@ -XXX,XX +XXX,XX @@ static void tcg_target_qemu_prologue(TCGContext *s)
354
}
282
tcg_out32(s, BCLR | BO_ALWAYS);
355
283
}
356
- iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
284
357
+ full = &env_tlb(env)->d[mmu_idx].fulltlb[index];
285
+static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg)
358
286
+{
359
/* Handle watchpoints. */
287
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R3, arg);
360
if (unlikely(tlb_addr & TLB_WATCHPOINT)) {
288
+ tcg_out_b(s, 0, tcg_code_gen_epilogue);
361
/* On watchpoint hit, this will longjmp out. */
289
+}
362
cpu_check_watchpoint(env_cpu(env), addr, size,
290
+
363
- iotlbentry->attrs, BP_MEM_READ, retaddr);
291
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
364
+ full->attrs, BP_MEM_READ, retaddr);
292
const TCGArg args[TCG_MAX_OP_ARGS],
365
}
293
const int const_args[TCG_MAX_OP_ARGS])
366
294
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
367
need_swap = size > 1 && (tlb_addr & TLB_BSWAP);
295
TCGArg a0, a1, a2;
368
296
369
/* Handle I/O access. */
297
switch (opc) {
370
if (likely(tlb_addr & TLB_MMIO)) {
298
- case INDEX_op_exit_tb:
371
- return io_readx(env, iotlbentry, mmu_idx, addr, retaddr,
299
- tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R3, args[0]);
372
+ return io_readx(env, full, mmu_idx, addr, retaddr,
300
- tcg_out_b(s, 0, tcg_code_gen_epilogue);
373
access_type, op ^ (need_swap * MO_BSWAP));
301
- break;
374
}
302
case INDEX_op_goto_tb:
375
303
if (s->tb_jmp_insn_offset) {
376
@@ -XXX,XX +XXX,XX @@ store_helper_unaligned(CPUArchState *env, target_ulong addr, uint64_t val,
304
/* Direct jump. */
377
*/
305
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
378
if (unlikely(tlb_addr & TLB_WATCHPOINT)) {
306
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
379
cpu_check_watchpoint(env_cpu(env), addr, size - size2,
307
case INDEX_op_mov_i64:
380
- env_tlb(env)->d[mmu_idx].iotlb[index].attrs,
308
case INDEX_op_call: /* Always emitted via tcg_out_call. */
381
+ env_tlb(env)->d[mmu_idx].fulltlb[index].attrs,
309
+ case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
382
BP_MEM_WRITE, retaddr);
310
default:
383
}
311
tcg_abort();
384
if (unlikely(tlb_addr2 & TLB_WATCHPOINT)) {
312
}
385
cpu_check_watchpoint(env_cpu(env), page2, size2,
313
diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
386
- env_tlb(env)->d[mmu_idx].iotlb[index2].attrs,
314
index XXXXXXX..XXXXXXX 100644
387
+ env_tlb(env)->d[mmu_idx].fulltlb[index2].attrs,
315
--- a/tcg/riscv/tcg-target.c.inc
388
BP_MEM_WRITE, retaddr);
316
+++ b/tcg/riscv/tcg-target.c.inc
389
}
317
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
390
318
391
@@ -XXX,XX +XXX,XX @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
319
static const tcg_insn_unit *tb_ret_addr;
392
320
393
/* Handle anything that isn't just a straight memory access. */
321
+static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
394
if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) {
322
+{
395
- CPUIOTLBEntry *iotlbentry;
323
+ /* Reuse the zeroing that exists for goto_ptr. */
396
+ CPUTLBEntryFull *full;
324
+ if (a0 == 0) {
397
bool need_swap;
325
+ tcg_out_call_int(s, tcg_code_gen_epilogue, true);
398
326
+ } else {
399
/* For anything that is unaligned, recurse through byte stores. */
327
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_A0, a0);
400
@@ -XXX,XX +XXX,XX @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
328
+ tcg_out_call_int(s, tb_ret_addr, true);
401
goto do_unaligned_access;
329
+ }
402
}
330
+}
403
331
+
404
- iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
332
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
405
+ full = &env_tlb(env)->d[mmu_idx].fulltlb[index];
333
const TCGArg args[TCG_MAX_OP_ARGS],
406
334
const int const_args[TCG_MAX_OP_ARGS])
407
/* Handle watchpoints. */
335
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
408
if (unlikely(tlb_addr & TLB_WATCHPOINT)) {
336
int c2 = const_args[2];
409
/* On watchpoint hit, this will longjmp out. */
337
410
cpu_check_watchpoint(env_cpu(env), addr, size,
338
switch (opc) {
411
- iotlbentry->attrs, BP_MEM_WRITE, retaddr);
339
- case INDEX_op_exit_tb:
412
+ full->attrs, BP_MEM_WRITE, retaddr);
340
- /* Reuse the zeroing that exists for goto_ptr. */
413
}
341
- if (a0 == 0) {
414
342
- tcg_out_call_int(s, tcg_code_gen_epilogue, true);
415
need_swap = size > 1 && (tlb_addr & TLB_BSWAP);
343
- } else {
416
344
- tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_A0, a0);
417
/* Handle I/O access. */
345
- tcg_out_call_int(s, tb_ret_addr, true);
418
if (tlb_addr & TLB_MMIO) {
346
- }
419
- io_writex(env, iotlbentry, mmu_idx, val, addr, retaddr,
347
- break;
420
+ io_writex(env, full, mmu_idx, val, addr, retaddr,
348
-
421
op ^ (need_swap * MO_BSWAP));
349
case INDEX_op_goto_tb:
422
return;
350
assert(s->tb_jmp_insn_offset == 0);
423
}
351
/* indirect jump method */
424
@@ -XXX,XX +XXX,XX @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
352
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
425
353
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
426
/* Handle clean RAM pages. */
354
case INDEX_op_mov_i64:
427
if (tlb_addr & TLB_NOTDIRTY) {
355
case INDEX_op_call: /* Always emitted via tcg_out_call. */
428
- notdirty_write(env_cpu(env), addr, size, iotlbentry, retaddr);
356
+ case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
429
+ notdirty_write(env_cpu(env), addr, size, full, retaddr);
357
default:
430
}
358
g_assert_not_reached();
431
359
}
432
haddr = (void *)((uintptr_t)addr + entry->addend);
360
diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
433
diff --git a/target/arm/mte_helper.c b/target/arm/mte_helper.c
361
index XXXXXXX..XXXXXXX 100644
434
index XXXXXXX..XXXXXXX 100644
362
--- a/tcg/s390x/tcg-target.c.inc
435
--- a/target/arm/mte_helper.c
363
+++ b/tcg/s390x/tcg-target.c.inc
436
+++ b/target/arm/mte_helper.c
364
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext* s, TCGReg data_reg, TCGReg addr_reg,
437
@@ -XXX,XX +XXX,XX @@ static uint8_t *allocation_tag_mem(CPUARMState *env, int ptr_mmu_idx,
438
return tags + index;
439
#else
440
uintptr_t index;
441
- CPUIOTLBEntry *iotlbentry;
442
+ CPUTLBEntryFull *full;
443
int in_page, flags;
444
ram_addr_t ptr_ra;
445
hwaddr ptr_paddr, tag_paddr, xlat;
446
@@ -XXX,XX +XXX,XX @@ static uint8_t *allocation_tag_mem(CPUARMState *env, int ptr_mmu_idx,
447
assert(!(flags & TLB_INVALID_MASK));
448
449
/*
450
- * Find the iotlbentry for ptr. This *must* be present in the TLB
451
+ * Find the CPUTLBEntryFull for ptr. This *must* be present in the TLB
452
* because we just found the mapping.
453
* TODO: Perhaps there should be a cputlb helper that returns a
454
* matching tlb entry + iotlb entry.
455
@@ -XXX,XX +XXX,XX @@ static uint8_t *allocation_tag_mem(CPUARMState *env, int ptr_mmu_idx,
456
g_assert(tlb_hit(comparator, ptr));
457
}
458
# endif
459
- iotlbentry = &env_tlb(env)->d[ptr_mmu_idx].iotlb[index];
460
+ full = &env_tlb(env)->d[ptr_mmu_idx].fulltlb[index];
461
462
/* If the virtual page MemAttr != Tagged, access unchecked. */
463
- if (!arm_tlb_mte_tagged(&iotlbentry->attrs)) {
464
+ if (!arm_tlb_mte_tagged(&full->attrs)) {
465
return NULL;
466
}
467
468
@@ -XXX,XX +XXX,XX @@ static uint8_t *allocation_tag_mem(CPUARMState *env, int ptr_mmu_idx,
469
int wp = ptr_access == MMU_DATA_LOAD ? BP_MEM_READ : BP_MEM_WRITE;
470
assert(ra != 0);
471
cpu_check_watchpoint(env_cpu(env), ptr, ptr_size,
472
- iotlbentry->attrs, wp, ra);
473
+ full->attrs, wp, ra);
474
}
475
476
/*
477
@@ -XXX,XX +XXX,XX @@ static uint8_t *allocation_tag_mem(CPUARMState *env, int ptr_mmu_idx,
478
tag_paddr = ptr_paddr >> (LOG2_TAG_GRANULE + 1);
479
480
/* Look up the address in tag space. */
481
- tag_asi = iotlbentry->attrs.secure ? ARMASIdx_TagS : ARMASIdx_TagNS;
482
+ tag_asi = full->attrs.secure ? ARMASIdx_TagS : ARMASIdx_TagNS;
483
tag_as = cpu_get_address_space(env_cpu(env), tag_asi);
484
mr = address_space_translate(tag_as, tag_paddr, &xlat, NULL,
485
tag_access == MMU_DATA_STORE,
486
- iotlbentry->attrs);
487
+ full->attrs);
488
489
/*
490
* Note that @mr will never be NULL. If there is nothing in the address
491
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
492
index XXXXXXX..XXXXXXX 100644
493
--- a/target/arm/sve_helper.c
494
+++ b/target/arm/sve_helper.c
495
@@ -XXX,XX +XXX,XX @@ bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env,
496
g_assert(tlb_hit(comparator, addr));
497
# endif
498
499
- CPUIOTLBEntry *iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
500
- info->attrs = iotlbentry->attrs;
501
+ CPUTLBEntryFull *full = &env_tlb(env)->d[mmu_idx].fulltlb[index];
502
+ info->attrs = full->attrs;
503
}
504
#endif
505
506
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
507
index XXXXXXX..XXXXXXX 100644
508
--- a/target/arm/translate-a64.c
509
+++ b/target/arm/translate-a64.c
510
@@ -XXX,XX +XXX,XX @@ static bool is_guarded_page(CPUARMState *env, DisasContext *s)
511
* table entry even for that case.
512
*/
513
return (tlb_hit(entry->addr_code, addr) &&
514
- arm_tlb_bti_gp(&env_tlb(env)->d[mmu_idx].iotlb[index].attrs));
515
+ arm_tlb_bti_gp(&env_tlb(env)->d[mmu_idx].fulltlb[index].attrs));
516
#endif
365
#endif
517
}
366
}
518
367
368
+static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
369
+{
370
+ /* Reuse the zeroing that exists for goto_ptr. */
371
+ if (a0 == 0) {
372
+ tgen_gotoi(s, S390_CC_ALWAYS, tcg_code_gen_epilogue);
373
+ } else {
374
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R2, a0);
375
+ tgen_gotoi(s, S390_CC_ALWAYS, tb_ret_addr);
376
+ }
377
+}
378
+
379
# define OP_32_64(x) \
380
case glue(glue(INDEX_op_,x),_i32): \
381
case glue(glue(INDEX_op_,x),_i64)
382
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
383
TCGArg a0, a1, a2;
384
385
switch (opc) {
386
- case INDEX_op_exit_tb:
387
- /* Reuse the zeroing that exists for goto_ptr. */
388
- a0 = args[0];
389
- if (a0 == 0) {
390
- tgen_gotoi(s, S390_CC_ALWAYS, tcg_code_gen_epilogue);
391
- } else {
392
- tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R2, a0);
393
- tgen_gotoi(s, S390_CC_ALWAYS, tb_ret_addr);
394
- }
395
- break;
396
-
397
case INDEX_op_goto_tb:
398
a0 = args[0];
399
/*
400
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
401
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
402
case INDEX_op_mov_i64:
403
case INDEX_op_call: /* Always emitted via tcg_out_call. */
404
+ case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
405
default:
406
tcg_abort();
407
}
408
diff --git a/tcg/sparc64/tcg-target.c.inc b/tcg/sparc64/tcg-target.c.inc
409
index XXXXXXX..XXXXXXX 100644
410
--- a/tcg/sparc64/tcg-target.c.inc
411
+++ b/tcg/sparc64/tcg-target.c.inc
412
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg data, TCGReg addr,
413
#endif /* CONFIG_SOFTMMU */
414
}
415
416
+static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
417
+{
418
+ if (check_fit_ptr(a0, 13)) {
419
+ tcg_out_arithi(s, TCG_REG_G0, TCG_REG_I7, 8, RETURN);
420
+ tcg_out_movi_imm13(s, TCG_REG_O0, a0);
421
+ return;
422
+ } else if (USE_REG_TB) {
423
+ intptr_t tb_diff = tcg_tbrel_diff(s, (void *)a0);
424
+ if (check_fit_ptr(tb_diff, 13)) {
425
+ tcg_out_arithi(s, TCG_REG_G0, TCG_REG_I7, 8, RETURN);
426
+ /* Note that TCG_REG_TB has been unwound to O1. */
427
+ tcg_out_arithi(s, TCG_REG_O0, TCG_REG_O1, tb_diff, ARITH_ADD);
428
+ return;
429
+ }
430
+ }
431
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_I0, a0 & ~0x3ff);
432
+ tcg_out_arithi(s, TCG_REG_G0, TCG_REG_I7, 8, RETURN);
433
+ tcg_out_arithi(s, TCG_REG_O0, TCG_REG_O0, a0 & 0x3ff, ARITH_OR);
434
+}
435
+
436
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
437
const TCGArg args[TCG_MAX_OP_ARGS],
438
const int const_args[TCG_MAX_OP_ARGS])
439
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
440
c2 = const_args[2];
441
442
switch (opc) {
443
- case INDEX_op_exit_tb:
444
- if (check_fit_ptr(a0, 13)) {
445
- tcg_out_arithi(s, TCG_REG_G0, TCG_REG_I7, 8, RETURN);
446
- tcg_out_movi_imm13(s, TCG_REG_O0, a0);
447
- break;
448
- } else if (USE_REG_TB) {
449
- intptr_t tb_diff = tcg_tbrel_diff(s, (void *)a0);
450
- if (check_fit_ptr(tb_diff, 13)) {
451
- tcg_out_arithi(s, TCG_REG_G0, TCG_REG_I7, 8, RETURN);
452
- /* Note that TCG_REG_TB has been unwound to O1. */
453
- tcg_out_arithi(s, TCG_REG_O0, TCG_REG_O1, tb_diff, ARITH_ADD);
454
- break;
455
- }
456
- }
457
- tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_I0, a0 & ~0x3ff);
458
- tcg_out_arithi(s, TCG_REG_G0, TCG_REG_I7, 8, RETURN);
459
- tcg_out_arithi(s, TCG_REG_O0, TCG_REG_O0, a0 & 0x3ff, ARITH_OR);
460
- break;
461
case INDEX_op_goto_tb:
462
if (s->tb_jmp_insn_offset) {
463
/* direct jump method */
464
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
465
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
466
case INDEX_op_mov_i64:
467
case INDEX_op_call: /* Always emitted via tcg_out_call. */
468
+ case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
469
default:
470
tcg_abort();
471
}
472
diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
473
index XXXXXXX..XXXXXXX 100644
474
--- a/tcg/tci/tcg-target.c.inc
475
+++ b/tcg/tci/tcg-target.c.inc
476
@@ -XXX,XX +XXX,XX @@ static void tcg_out_call(TCGContext *s, const tcg_insn_unit *func,
477
# define CASE_64(x)
478
#endif
479
480
+static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg)
481
+{
482
+ tcg_out_op_p(s, INDEX_op_exit_tb, (void *)arg);
483
+}
484
+
485
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
486
const TCGArg args[TCG_MAX_OP_ARGS],
487
const int const_args[TCG_MAX_OP_ARGS])
488
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
489
TCGOpcode exts;
490
491
switch (opc) {
492
- case INDEX_op_exit_tb:
493
- tcg_out_op_p(s, opc, (void *)args[0]);
494
- break;
495
-
496
case INDEX_op_goto_tb:
497
tcg_debug_assert(s->tb_jmp_insn_offset == 0);
498
/* indirect jump method. */
499
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
500
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
501
case INDEX_op_mov_i64:
502
case INDEX_op_call: /* Always emitted via tcg_out_call. */
503
+ case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
504
default:
505
tcg_abort();
506
}
519
--
2.34.1
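The patches that follow replace run-time tests of s->tb_jmp_insn_offset with qemu_build_assert(TCG_TARGET_HAS_direct_jump) (or its negation), so the assumption is checked when the backend is compiled rather than at run time. A rough standalone analogue of that idiom — the names below are placeholders, not QEMU API — looks like this:

```c
#include <assert.h>

/* Stand-in for a backend capability macro such as TCG_TARGET_HAS_direct_jump. */
#define BACKEND_HAS_DIRECT_JUMP 1

static void emit_goto_tb(void)
{
    /* Rejected at build time if the capability is configured away. */
    static_assert(BACKEND_HAS_DIRECT_JUMP, "direct jump support required");
    /* ... emit the patchable jump here ... */
}
```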
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/i386/tcg-target.c.inc | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
 
     switch (opc) {
     case INDEX_op_goto_tb:
-        if (s->tb_jmp_insn_offset) {
-            /* direct jump method */
-            int gap;
-            /* jump displacement must be aligned for atomic patching;
+        qemu_build_assert(TCG_TARGET_HAS_direct_jump);
+        {
+            /*
+             * Jump displacement must be aligned for atomic patching;
              * see if we need to add extra nops before jump
              */
-            gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
+            int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
             if (gap != 1) {
                 tcg_out_nopn(s, gap - 1);
             }
             tcg_out8(s, OPC_JMP_long); /* jmp im */
             s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
             tcg_out32(s, 0);
-        } else {
-            /* indirect jump method */
-            tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
-                                 (intptr_t)(s->tb_jmp_target_addr + a0));
         }
         set_jmp_reset_offset(s, a0);
         break;
--
2.34.1

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/ppc/tcg-target.c.inc | 32 +++++++++++++-------------------
 1 file changed, 13 insertions(+), 19 deletions(-)

diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
 
     switch (opc) {
     case INDEX_op_goto_tb:
-        if (s->tb_jmp_insn_offset) {
-            /* Direct jump. */
-            if (TCG_TARGET_REG_BITS == 64) {
-                /* Ensure the next insns are 8 or 16-byte aligned. */
-                while ((uintptr_t)s->code_ptr & (have_isa_2_07 ? 15 : 7)) {
-                    tcg_out32(s, NOP);
-                }
-                s->tb_jmp_insn_offset[args[0]] = tcg_current_code_size(s);
-                tcg_out32(s, ADDIS | TAI(TCG_REG_TB, TCG_REG_TB, 0));
-                tcg_out32(s, ADDI | TAI(TCG_REG_TB, TCG_REG_TB, 0));
-            } else {
-                s->tb_jmp_insn_offset[args[0]] = tcg_current_code_size(s);
-                tcg_out32(s, B);
-                s->tb_jmp_reset_offset[args[0]] = tcg_current_code_size(s);
-                break;
+        qemu_build_assert(TCG_TARGET_HAS_direct_jump);
+        /* Direct jump. */
+        if (TCG_TARGET_REG_BITS == 64) {
+            /* Ensure the next insns are 8 or 16-byte aligned. */
+            while ((uintptr_t)s->code_ptr & (have_isa_2_07 ? 15 : 7)) {
+                tcg_out32(s, NOP);
             }
+            s->tb_jmp_insn_offset[args[0]] = tcg_current_code_size(s);
+            tcg_out32(s, ADDIS | TAI(TCG_REG_TB, TCG_REG_TB, 0));
+            tcg_out32(s, ADDI | TAI(TCG_REG_TB, TCG_REG_TB, 0));
         } else {
-            /* Indirect jump. */
-            tcg_debug_assert(s->tb_jmp_insn_offset == NULL);
-            tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TB, 0,
-                       (intptr_t)(s->tb_jmp_insn_offset + args[0]));
+            s->tb_jmp_insn_offset[args[0]] = tcg_current_code_size(s);
+            tcg_out32(s, B);
+            s->tb_jmp_reset_offset[args[0]] = tcg_current_code_size(s);
+            break;
         }
         tcg_out32(s, MTSPR | RS(TCG_REG_TB) | CTR);
         tcg_out32(s, BCCTR | BO_ALWAYS);
--
2.34.1

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/sparc64/tcg-target.c.inc | 41 +++++++++++-------------------------
 1 file changed, 12 insertions(+), 29 deletions(-)

diff --git a/tcg/sparc64/tcg-target.c.inc b/tcg/sparc64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/sparc64/tcg-target.c.inc
+++ b/tcg/sparc64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
     return false;
 }
 
-static void tcg_out_ld_ptr(TCGContext *s, TCGReg ret, const void *arg)
-{
-    intptr_t diff = tcg_tbrel_diff(s, arg);
-    if (USE_REG_TB && check_fit_ptr(diff, 13)) {
-        tcg_out_ld(s, TCG_TYPE_PTR, ret, TCG_REG_TB, diff);
-        return;
-    }
-    tcg_out_movi(s, TCG_TYPE_PTR, ret, (uintptr_t)arg & ~0x3ff);
-    tcg_out_ld(s, TCG_TYPE_PTR, ret, ret, (uintptr_t)arg & 0x3ff);
-}
-
 static void tcg_out_sety(TCGContext *s, TCGReg rs)
 {
     tcg_out32(s, WRY | INSN_RS1(TCG_REG_G0) | INSN_RS2(rs));
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
 
     switch (opc) {
     case INDEX_op_goto_tb:
-        if (s->tb_jmp_insn_offset) {
-            /* direct jump method */
-            if (USE_REG_TB) {
-                /* make sure the patch is 8-byte aligned.  */
-                if ((intptr_t)s->code_ptr & 4) {
-                    tcg_out_nop(s);
-                }
-                s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
-                tcg_out_sethi(s, TCG_REG_T1, 0);
-                tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, 0, ARITH_OR);
-                tcg_out_arith(s, TCG_REG_G0, TCG_REG_TB, TCG_REG_T1, JMPL);
-                tcg_out_arith(s, TCG_REG_TB, TCG_REG_TB, TCG_REG_T1, ARITH_ADD);
-            } else {
-                s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
-                tcg_out32(s, CALL);
+        qemu_build_assert(TCG_TARGET_HAS_direct_jump);
+        /* Direct jump. */
+        if (USE_REG_TB) {
+            /* make sure the patch is 8-byte aligned.  */
+            if ((intptr_t)s->code_ptr & 4) {
                 tcg_out_nop(s);
             }
+            s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
+            tcg_out_sethi(s, TCG_REG_T1, 0);
+            tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, 0, ARITH_OR);
+            tcg_out_arith(s, TCG_REG_G0, TCG_REG_TB, TCG_REG_T1, JMPL);
+            tcg_out_arith(s, TCG_REG_TB, TCG_REG_TB, TCG_REG_T1, ARITH_ADD);
         } else {
-            /* indirect jump method */
-            tcg_out_ld_ptr(s, TCG_REG_TB, s->tb_jmp_target_addr + a0);
-            tcg_out_arithi(s, TCG_REG_G0, TCG_REG_TB, 0, JMPL);
+            s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
+            tcg_out32(s, CALL);
             tcg_out_nop(s);
         }
         set_jmp_reset_offset(s, a0);
--
2.34.1

Test TCG_TARGET_HAS_direct_jump instead of testing an
implementation pointer.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/aarch64/tcg-target.c.inc     | 2 +-
 tcg/arm/tcg-target.c.inc         | 2 +-
 tcg/loongarch64/tcg-target.c.inc | 2 +-
 tcg/mips/tcg-target.c.inc        | 2 +-
 tcg/riscv/tcg-target.c.inc       | 2 +-
 tcg/tci/tcg-target.c.inc         | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/aarch64/tcg-target.c.inc
+++ b/tcg/aarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
 
     switch (opc) {
     case INDEX_op_goto_tb:
-        tcg_debug_assert(s->tb_jmp_insn_offset != NULL);
+        qemu_build_assert(TCG_TARGET_HAS_direct_jump);
         /*
          * Ensure that ADRP+ADD are 8-byte aligned so that an atomic
          * write can be used to patch the target address.
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
         intptr_t ptr, dif, dil;
         TCGReg base = TCG_REG_PC;
 
-        tcg_debug_assert(s->tb_jmp_insn_offset == 0);
+        qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
         ptr = (intptr_t)tcg_splitwx_to_rx(s->tb_jmp_target_addr + args[0]);
         dif = tcg_pcrel_diff(s, (void *)ptr) - 8;
         dil = sextract32(dif, 0, 12);
diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target.c.inc
+++ b/tcg/loongarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
 
     switch (opc) {
     case INDEX_op_goto_tb:
-        tcg_debug_assert(s->tb_jmp_insn_offset != NULL);
+        qemu_build_assert(TCG_TARGET_HAS_direct_jump);
         /*
          * Ensure that patch area is 8-byte aligned so that an
          * atomic write can be used to patch the target address.
diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/mips/tcg-target.c.inc
+++ b/tcg/mips/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
     switch (opc) {
     case INDEX_op_goto_tb:
         /* indirect jump method */
-        tcg_debug_assert(s->tb_jmp_insn_offset == 0);
+        qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
         tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP0, TCG_REG_ZERO,
                    (uintptr_t)(s->tb_jmp_target_addr + a0));
         tcg_out_opc_reg(s, OPC_JR, 0, TCG_TMP0, 0);
diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/riscv/tcg-target.c.inc
+++ b/tcg/riscv/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
 
     switch (opc) {
     case INDEX_op_goto_tb:
-        assert(s->tb_jmp_insn_offset == 0);
+        qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
         /* indirect jump method */
         tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, TCG_REG_ZERO,
80
(uintptr_t)(s->tb_jmp_target_addr + a0));
81
diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
82
index XXXXXXX..XXXXXXX 100644
83
--- a/tcg/tci/tcg-target.c.inc
84
+++ b/tcg/tci/tcg-target.c.inc
85
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
86
87
switch (opc) {
88
case INDEX_op_goto_tb:
89
- tcg_debug_assert(s->tb_jmp_insn_offset == 0);
90
+ qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
91
/* indirect jump method. */
92
tcg_out_op_p(s, opc, s->tb_jmp_target_addr + args[0]);
93
set_jmp_reset_offset(s, args[0]);
31
--
94
--
32
2.34.1
95
2.34.1
33
96
34
97
diff view generated by jsdifflib
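
To make the TARGET_PAGE_ENTRY_EXTRA hook above concrete: a target opts in by
defining the macro before cpu-defs.h is pulled in (typically in its
cpu-param.h), and the named fields then ride along in every CPUTLBEntryFull.
The field names and helper functions below are invented for illustration, not
taken from an actual target:

    /* cpu-param.h (illustrative): */
    #define TARGET_PAGE_ENTRY_EXTRA  \
        bool guarded;                \
        uint8_t mem_attr;

    /* On the target's tlb_fill path (sketch; most fields elided): */
    static void example_fill_entry(CPUState *cs, int mmu_idx, target_ulong vaddr,
                                   uint64_t pte, MemTxAttrs attrs)
    {
        CPUTLBEntryFull full = { .attrs = attrs, .lg_page_size = TARGET_PAGE_BITS };

        full.guarded  = pte_guarded_bit(pte);      /* hypothetical helpers */
        full.mem_attr = pte_memory_attrs(pte);
        tlb_set_page_full(cs, mmu_idx, vaddr, &full);
    }

The point of caching here is that the guest page table need not be re-walked
later just to recover these attributes.
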
1
When PAGE_WRITE_INV is set when calling tlb_set_page,
1
Similar to the existing set_jmp_reset_offset. Move any assert for
2
we immediately set TLB_INVALID_MASK in order to force
2
TCG_TARGET_HAS_direct_jump into the new function (which now cannot
3
tlb_fill to be called on the next lookup. Here in
3
be build-time). Will be unused if TCG_TARGET_HAS_direct_jump is
4
probe_access_internal, we have just called tlb_fill
4
constant 0, but we can't test for constant in the preprocessor,
5
and eliminated true misses, thus the lookup must be valid.
5
so just mark it G_GNUC_UNUSED.
6
7
This allows us to remove a warning comment from s390x.
8
There doesn't seem to be a reason to change the code though.
9
6
10
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
7
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
11
Reviewed-by: David Hildenbrand <david@redhat.com>
12
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
13
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
8
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
14
---
9
---
15
accel/tcg/cputlb.c | 10 +++++++++-
10
tcg/tcg.c | 10 ++++++++++
16
target/s390x/tcg/mem_helper.c | 4 ----
11
tcg/aarch64/tcg-target.c.inc | 3 +--
17
2 files changed, 9 insertions(+), 5 deletions(-)
12
tcg/i386/tcg-target.c.inc | 3 +--
13
tcg/loongarch64/tcg-target.c.inc | 3 +--
14
tcg/ppc/tcg-target.c.inc | 7 +++----
15
tcg/s390x/tcg-target.c.inc | 2 +-
16
tcg/sparc64/tcg-target.c.inc | 5 ++---
17
7 files changed, 19 insertions(+), 14 deletions(-)
18
18
19
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
19
diff --git a/tcg/tcg.c b/tcg/tcg.c
20
index XXXXXXX..XXXXXXX 100644
20
index XXXXXXX..XXXXXXX 100644
21
--- a/accel/tcg/cputlb.c
21
--- a/tcg/tcg.c
22
+++ b/accel/tcg/cputlb.c
22
+++ b/tcg/tcg.c
23
@@ -XXX,XX +XXX,XX @@ static int probe_access_internal(CPUArchState *env, target_ulong addr,
23
@@ -XXX,XX +XXX,XX @@ static void set_jmp_reset_offset(TCGContext *s, int which)
24
}
24
s->tb_jmp_reset_offset[which] = tcg_current_code_size(s);
25
tlb_addr = tlb_read_ofs(entry, elt_ofs);
25
}
26
26
27
+ flags = TLB_FLAGS_MASK;
27
+static void G_GNUC_UNUSED set_jmp_insn_offset(TCGContext *s, int which)
28
page_addr = addr & TARGET_PAGE_MASK;
28
+{
29
if (!tlb_hit_page(tlb_addr, page_addr)) {
29
+ /*
30
if (!victim_tlb_hit(env, mmu_idx, index, elt_ofs, page_addr)) {
30
+ * We will check for overflow at the end of the opcode loop in
31
@@ -XXX,XX +XXX,XX @@ static int probe_access_internal(CPUArchState *env, target_ulong addr,
31
+ * tcg_gen_code, where we bound tcg_current_code_size to UINT16_MAX.
32
32
+ */
33
/* TLB resize via tlb_fill may have moved the entry. */
33
+ tcg_debug_assert(TCG_TARGET_HAS_direct_jump);
34
entry = tlb_entry(env, mmu_idx, addr);
34
+ s->tb_jmp_insn_offset[which] = tcg_current_code_size(s);
35
+}
35
+
36
+
36
+ /*
37
/* Signal overflow, starting over with fewer guest insns. */
37
+ * With PAGE_WRITE_INV, we set TLB_INVALID_MASK immediately,
38
static G_NORETURN
38
+ * to force the next access through tlb_fill. We've just
39
void tcg_raise_tb_overflow(TCGContext *s)
39
+ * called tlb_fill, so we know that this entry *is* valid.
40
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
40
+ */
41
index XXXXXXX..XXXXXXX 100644
41
+ flags &= ~TLB_INVALID_MASK;
42
--- a/tcg/aarch64/tcg-target.c.inc
43
+++ b/tcg/aarch64/tcg-target.c.inc
44
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
45
46
switch (opc) {
47
case INDEX_op_goto_tb:
48
- qemu_build_assert(TCG_TARGET_HAS_direct_jump);
49
/*
50
* Ensure that ADRP+ADD are 8-byte aligned so that an atomic
51
* write can be used to patch the target address.
52
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
53
if ((uintptr_t)s->code_ptr & 7) {
54
tcg_out32(s, NOP);
42
}
55
}
43
tlb_addr = tlb_read_ofs(entry, elt_ofs);
56
- s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
44
}
57
+ set_jmp_insn_offset(s, a0);
45
- flags = tlb_addr & TLB_FLAGS_MASK;
58
/*
46
+ flags &= tlb_addr;
59
* actual branch destination will be patched by
47
60
* tb_target_set_jmp_target later
48
/* Fold all "mmio-like" bits into TLB_MMIO. This is not RAM. */
61
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
49
if (unlikely(flags & ~(TLB_WATCHPOINT | TLB_NOTDIRTY))) {
50
diff --git a/target/s390x/tcg/mem_helper.c b/target/s390x/tcg/mem_helper.c
51
index XXXXXXX..XXXXXXX 100644
62
index XXXXXXX..XXXXXXX 100644
52
--- a/target/s390x/tcg/mem_helper.c
63
--- a/tcg/i386/tcg-target.c.inc
53
+++ b/target/s390x/tcg/mem_helper.c
64
+++ b/tcg/i386/tcg-target.c.inc
54
@@ -XXX,XX +XXX,XX @@ static int s390_probe_access(CPUArchState *env, target_ulong addr, int size,
65
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
55
#else
66
56
int flags;
67
switch (opc) {
57
68
case INDEX_op_goto_tb:
58
- /*
69
- qemu_build_assert(TCG_TARGET_HAS_direct_jump);
59
- * For !CONFIG_USER_ONLY, we cannot rely on TLB_INVALID_MASK or haddr==NULL
70
{
60
- * to detect if there was an exception during tlb_fill().
71
/*
61
- */
72
* Jump displacement must be aligned for atomic patching;
62
env->tlb_fill_exc = 0;
73
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
63
flags = probe_access_flags(env, addr, access_type, mmu_idx, nonfault, phost,
74
tcg_out_nopn(s, gap - 1);
64
ra);
75
}
76
tcg_out8(s, OPC_JMP_long); /* jmp im */
77
- s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
78
+ set_jmp_insn_offset(s, a0);
79
tcg_out32(s, 0);
80
}
81
set_jmp_reset_offset(s, a0);
82
diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
83
index XXXXXXX..XXXXXXX 100644
84
--- a/tcg/loongarch64/tcg-target.c.inc
85
+++ b/tcg/loongarch64/tcg-target.c.inc
86
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
87
88
switch (opc) {
89
case INDEX_op_goto_tb:
90
- qemu_build_assert(TCG_TARGET_HAS_direct_jump);
91
/*
92
* Ensure that patch area is 8-byte aligned so that an
93
* atomic write can be used to patch the target address.
94
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
95
if ((uintptr_t)s->code_ptr & 7) {
96
tcg_out_nop(s);
97
}
98
- s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
99
+ set_jmp_insn_offset(s, a0);
100
/*
101
* actual branch destination will be patched by
102
* tb_target_set_jmp_target later
103
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
104
index XXXXXXX..XXXXXXX 100644
105
--- a/tcg/ppc/tcg-target.c.inc
106
+++ b/tcg/ppc/tcg-target.c.inc
107
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
108
109
switch (opc) {
110
case INDEX_op_goto_tb:
111
- qemu_build_assert(TCG_TARGET_HAS_direct_jump);
112
/* Direct jump. */
113
if (TCG_TARGET_REG_BITS == 64) {
114
/* Ensure the next insns are 8 or 16-byte aligned. */
115
while ((uintptr_t)s->code_ptr & (have_isa_2_07 ? 15 : 7)) {
116
tcg_out32(s, NOP);
117
}
118
- s->tb_jmp_insn_offset[args[0]] = tcg_current_code_size(s);
119
+ set_jmp_insn_offset(s, args[0]);
120
tcg_out32(s, ADDIS | TAI(TCG_REG_TB, TCG_REG_TB, 0));
121
tcg_out32(s, ADDI | TAI(TCG_REG_TB, TCG_REG_TB, 0));
122
} else {
123
- s->tb_jmp_insn_offset[args[0]] = tcg_current_code_size(s);
124
+ set_jmp_insn_offset(s, args[0]);
125
tcg_out32(s, B);
126
- s->tb_jmp_reset_offset[args[0]] = tcg_current_code_size(s);
127
+ set_jmp_reset_offset(s, args[0]);
128
break;
129
}
130
tcg_out32(s, MTSPR | RS(TCG_REG_TB) | CTR);
131
diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
132
index XXXXXXX..XXXXXXX 100644
133
--- a/tcg/s390x/tcg-target.c.inc
134
+++ b/tcg/s390x/tcg-target.c.inc
135
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
136
tcg_out16(s, NOP);
137
}
138
tcg_out16(s, RIL_BRCL | (S390_CC_ALWAYS << 4));
139
- s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
140
+ set_jmp_insn_offset(s, a0);
141
s->code_ptr += 2;
142
set_jmp_reset_offset(s, a0);
143
break;
144
diff --git a/tcg/sparc64/tcg-target.c.inc b/tcg/sparc64/tcg-target.c.inc
145
index XXXXXXX..XXXXXXX 100644
146
--- a/tcg/sparc64/tcg-target.c.inc
147
+++ b/tcg/sparc64/tcg-target.c.inc
148
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
149
150
switch (opc) {
151
case INDEX_op_goto_tb:
152
- qemu_build_assert(TCG_TARGET_HAS_direct_jump);
153
/* Direct jump. */
154
if (USE_REG_TB) {
155
/* make sure the patch is 8-byte aligned. */
156
if ((intptr_t)s->code_ptr & 4) {
157
tcg_out_nop(s);
158
}
159
- s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
160
+ set_jmp_insn_offset(s, a0);
161
tcg_out_sethi(s, TCG_REG_T1, 0);
162
tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, 0, ARITH_OR);
163
tcg_out_arith(s, TCG_REG_G0, TCG_REG_TB, TCG_REG_T1, JMPL);
164
tcg_out_arith(s, TCG_REG_TB, TCG_REG_TB, TCG_REG_T1, ARITH_ADD);
165
} else {
166
- s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
167
+ set_jmp_insn_offset(s, a0);
168
tcg_out32(s, CALL);
169
tcg_out_nop(s);
170
}
65
--
171
--
66
2.34.1
172
2.34.1
67
173
68
174
diff view generated by jsdifflib
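
The flag handling in the probe_access_internal() change above is easier to see
in isolation. The helper below is a condensed, invented restatement of that
logic, not the function itself: start from every possible flag bit, clear
TLB_INVALID_MASK only on the path where tlb_fill() has just run (so the entry
is known valid even though PAGE_WRITE_INV forced the bit on), and finally keep
only the bits that are really set in the entry.

    static int example_extract_flags(uint64_t tlb_addr, bool just_filled)
    {
        uint64_t flags = TLB_FLAGS_MASK;

        if (just_filled) {
            /* tlb_fill() has eliminated true misses; this lookup is valid. */
            flags &= ~TLB_INVALID_MASK;
        }
        flags &= tlb_addr;            /* keep only bits present in the entry */
        return flags;
    }
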
1
Add an interface to return the CPUTLBEntryFull struct
1
Similar to the existing set_jmp_reset_offset. Include the
2
that goes with the lookup. The result is not intended
2
rw->rx address space conversion done by arm and s390x, and
3
to be valid across multiple lookups, so the user must
3
forgotten by mips and riscv.
4
use the results immediately.
5
4
6
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
7
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
6
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
8
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
9
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
10
---
8
---
11
include/exec/exec-all.h | 15 +++++++++++++
9
tcg/tcg.c | 9 +++++++++
12
include/qemu/typedefs.h | 1 +
10
tcg/arm/tcg-target.c.inc | 2 +-
13
accel/tcg/cputlb.c | 47 +++++++++++++++++++++++++----------------
11
tcg/mips/tcg-target.c.inc | 2 +-
14
3 files changed, 45 insertions(+), 18 deletions(-)
12
tcg/riscv/tcg-target.c.inc | 2 +-
13
tcg/tci/tcg-target.c.inc | 2 +-
14
5 files changed, 13 insertions(+), 4 deletions(-)
15
15
16
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
16
diff --git a/tcg/tcg.c b/tcg/tcg.c
17
index XXXXXXX..XXXXXXX 100644
17
index XXXXXXX..XXXXXXX 100644
18
--- a/include/exec/exec-all.h
18
--- a/tcg/tcg.c
19
+++ b/include/exec/exec-all.h
19
+++ b/tcg/tcg.c
20
@@ -XXX,XX +XXX,XX @@ int probe_access_flags(CPUArchState *env, target_ulong addr,
20
@@ -XXX,XX +XXX,XX @@ static void G_GNUC_UNUSED set_jmp_insn_offset(TCGContext *s, int which)
21
MMUAccessType access_type, int mmu_idx,
21
s->tb_jmp_insn_offset[which] = tcg_current_code_size(s);
22
bool nonfault, void **phost, uintptr_t retaddr);
23
24
+#ifndef CONFIG_USER_ONLY
25
+/**
26
+ * probe_access_full:
27
+ * Like probe_access_flags, except also return into @pfull.
28
+ *
29
+ * The CPUTLBEntryFull structure returned via @pfull is transient
30
+ * and must be consumed or copied immediately, before any further
31
+ * access or changes to TLB @mmu_idx.
32
+ */
33
+int probe_access_full(CPUArchState *env, target_ulong addr,
34
+ MMUAccessType access_type, int mmu_idx,
35
+ bool nonfault, void **phost,
36
+ CPUTLBEntryFull **pfull, uintptr_t retaddr);
37
+#endif
38
+
39
#define CODE_GEN_ALIGN 16 /* must be >= of the size of a icache line */
40
41
/* Estimated block size for TB allocation. */
42
diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h
43
index XXXXXXX..XXXXXXX 100644
44
--- a/include/qemu/typedefs.h
45
+++ b/include/qemu/typedefs.h
46
@@ -XXX,XX +XXX,XX @@ typedef struct ConfidentialGuestSupport ConfidentialGuestSupport;
47
typedef struct CPUAddressSpace CPUAddressSpace;
48
typedef struct CPUArchState CPUArchState;
49
typedef struct CPUState CPUState;
50
+typedef struct CPUTLBEntryFull CPUTLBEntryFull;
51
typedef struct DeviceListener DeviceListener;
52
typedef struct DeviceState DeviceState;
53
typedef struct DirtyBitmapSnapshot DirtyBitmapSnapshot;
54
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
55
index XXXXXXX..XXXXXXX 100644
56
--- a/accel/tcg/cputlb.c
57
+++ b/accel/tcg/cputlb.c
58
@@ -XXX,XX +XXX,XX @@ static void notdirty_write(CPUState *cpu, vaddr mem_vaddr, unsigned size,
59
static int probe_access_internal(CPUArchState *env, target_ulong addr,
60
int fault_size, MMUAccessType access_type,
61
int mmu_idx, bool nonfault,
62
- void **phost, uintptr_t retaddr)
63
+ void **phost, CPUTLBEntryFull **pfull,
64
+ uintptr_t retaddr)
65
{
66
uintptr_t index = tlb_index(env, mmu_idx, addr);
67
CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
68
@@ -XXX,XX +XXX,XX @@ static int probe_access_internal(CPUArchState *env, target_ulong addr,
69
mmu_idx, nonfault, retaddr)) {
70
/* Non-faulting page table read failed. */
71
*phost = NULL;
72
+ *pfull = NULL;
73
return TLB_INVALID_MASK;
74
}
75
76
/* TLB resize via tlb_fill may have moved the entry. */
77
+ index = tlb_index(env, mmu_idx, addr);
78
entry = tlb_entry(env, mmu_idx, addr);
79
80
/*
81
@@ -XXX,XX +XXX,XX @@ static int probe_access_internal(CPUArchState *env, target_ulong addr,
82
}
83
flags &= tlb_addr;
84
85
+ *pfull = &env_tlb(env)->d[mmu_idx].fulltlb[index];
86
+
87
/* Fold all "mmio-like" bits into TLB_MMIO. This is not RAM. */
88
if (unlikely(flags & ~(TLB_WATCHPOINT | TLB_NOTDIRTY))) {
89
*phost = NULL;
90
@@ -XXX,XX +XXX,XX @@ static int probe_access_internal(CPUArchState *env, target_ulong addr,
91
return flags;
92
}
22
}
93
23
94
-int probe_access_flags(CPUArchState *env, target_ulong addr,
24
+static uintptr_t G_GNUC_UNUSED get_jmp_target_addr(TCGContext *s, int which)
95
- MMUAccessType access_type, int mmu_idx,
96
- bool nonfault, void **phost, uintptr_t retaddr)
97
+int probe_access_full(CPUArchState *env, target_ulong addr,
98
+ MMUAccessType access_type, int mmu_idx,
99
+ bool nonfault, void **phost, CPUTLBEntryFull **pfull,
100
+ uintptr_t retaddr)
101
{
102
- int flags;
103
-
104
- flags = probe_access_internal(env, addr, 0, access_type, mmu_idx,
105
- nonfault, phost, retaddr);
106
+ int flags = probe_access_internal(env, addr, 0, access_type, mmu_idx,
107
+ nonfault, phost, pfull, retaddr);
108
109
/* Handle clean RAM pages. */
110
if (unlikely(flags & TLB_NOTDIRTY)) {
111
- uintptr_t index = tlb_index(env, mmu_idx, addr);
112
- CPUTLBEntryFull *full = &env_tlb(env)->d[mmu_idx].fulltlb[index];
113
-
114
- notdirty_write(env_cpu(env), addr, 1, full, retaddr);
115
+ notdirty_write(env_cpu(env), addr, 1, *pfull, retaddr);
116
flags &= ~TLB_NOTDIRTY;
117
}
118
119
return flags;
120
}
121
122
+int probe_access_flags(CPUArchState *env, target_ulong addr,
123
+ MMUAccessType access_type, int mmu_idx,
124
+ bool nonfault, void **phost, uintptr_t retaddr)
125
+{
25
+{
126
+ CPUTLBEntryFull *full;
26
+ /*
127
+
27
+ * Return the read-execute version of the pointer, for the benefit
128
+ return probe_access_full(env, addr, access_type, mmu_idx,
28
+ * of any pc-relative addressing mode.
129
+ nonfault, phost, &full, retaddr);
29
+ */
30
+ return (uintptr_t)tcg_splitwx_to_rx(&s->tb_jmp_target_addr[which]);
130
+}
31
+}
131
+
32
+
132
void *probe_access(CPUArchState *env, target_ulong addr, int size,
33
/* Signal overflow, starting over with fewer guest insns. */
133
MMUAccessType access_type, int mmu_idx, uintptr_t retaddr)
34
static G_NORETURN
134
{
35
void tcg_raise_tb_overflow(TCGContext *s)
135
+ CPUTLBEntryFull *full;
36
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
136
void *host;
37
index XXXXXXX..XXXXXXX 100644
137
int flags;
38
--- a/tcg/arm/tcg-target.c.inc
138
39
+++ b/tcg/arm/tcg-target.c.inc
139
g_assert(-(addr | TARGET_PAGE_MASK) >= size);
40
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
140
41
TCGReg base = TCG_REG_PC;
141
flags = probe_access_internal(env, addr, size, access_type, mmu_idx,
42
142
- false, &host, retaddr);
43
qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
143
+ false, &host, &full, retaddr);
44
- ptr = (intptr_t)tcg_splitwx_to_rx(s->tb_jmp_target_addr + args[0]);
144
45
+ ptr = get_jmp_target_addr(s, args[0]);
145
/* Per the interface, size == 0 merely faults the access. */
46
dif = tcg_pcrel_diff(s, (void *)ptr) - 8;
146
if (size == 0) {
47
dil = sextract32(dif, 0, 12);
147
@@ -XXX,XX +XXX,XX @@ void *probe_access(CPUArchState *env, target_ulong addr, int size,
48
if (dif != dil) {
148
}
49
diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
149
50
index XXXXXXX..XXXXXXX 100644
150
if (unlikely(flags & (TLB_NOTDIRTY | TLB_WATCHPOINT))) {
51
--- a/tcg/mips/tcg-target.c.inc
151
- uintptr_t index = tlb_index(env, mmu_idx, addr);
52
+++ b/tcg/mips/tcg-target.c.inc
152
- CPUTLBEntryFull *full = &env_tlb(env)->d[mmu_idx].fulltlb[index];
53
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
153
-
54
/* indirect jump method */
154
/* Handle watchpoints. */
55
qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
155
if (flags & TLB_WATCHPOINT) {
56
tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP0, TCG_REG_ZERO,
156
int wp_access = (access_type == MMU_DATA_STORE
57
- (uintptr_t)(s->tb_jmp_target_addr + a0));
157
@@ -XXX,XX +XXX,XX @@ void *probe_access(CPUArchState *env, target_ulong addr, int size,
58
+ get_jmp_target_addr(s, a0));
158
void *tlb_vaddr_to_host(CPUArchState *env, abi_ptr addr,
59
tcg_out_opc_reg(s, OPC_JR, 0, TCG_TMP0, 0);
159
MMUAccessType access_type, int mmu_idx)
60
tcg_out_nop(s);
160
{
61
set_jmp_reset_offset(s, a0);
161
+ CPUTLBEntryFull *full;
62
diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
162
void *host;
63
index XXXXXXX..XXXXXXX 100644
163
int flags;
64
--- a/tcg/riscv/tcg-target.c.inc
164
65
+++ b/tcg/riscv/tcg-target.c.inc
165
flags = probe_access_internal(env, addr, 0, access_type,
66
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
166
- mmu_idx, true, &host, 0);
67
qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
167
+ mmu_idx, true, &host, &full, 0);
68
/* indirect jump method */
168
69
tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, TCG_REG_ZERO,
169
/* No combination of flags are expected by the caller. */
70
- (uintptr_t)(s->tb_jmp_target_addr + a0));
170
return flags ? NULL : host;
71
+ get_jmp_target_addr(s, a0));
171
@@ -XXX,XX +XXX,XX @@ void *tlb_vaddr_to_host(CPUArchState *env, abi_ptr addr,
72
tcg_out_opc_imm(s, OPC_JALR, TCG_REG_ZERO, TCG_REG_TMP0, 0);
172
tb_page_addr_t get_page_addr_code_hostp(CPUArchState *env, target_ulong addr,
73
set_jmp_reset_offset(s, a0);
173
void **hostp)
74
break;
174
{
75
diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
175
+ CPUTLBEntryFull *full;
76
index XXXXXXX..XXXXXXX 100644
176
void *p;
77
--- a/tcg/tci/tcg-target.c.inc
177
78
+++ b/tcg/tci/tcg-target.c.inc
178
(void)probe_access_internal(env, addr, 1, MMU_INST_FETCH,
79
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
179
- cpu_mmu_index(env, true), false, &p, 0);
80
case INDEX_op_goto_tb:
180
+ cpu_mmu_index(env, true), false, &p, &full, 0);
81
qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
181
if (p == NULL) {
82
/* indirect jump method. */
182
return -1;
83
- tcg_out_op_p(s, opc, s->tb_jmp_target_addr + args[0]);
183
}
84
+ tcg_out_op_p(s, opc, (void *)get_jmp_target_addr(s, args[0]));
85
set_jmp_reset_offset(s, args[0]);
86
break;
87
184
--
88
--
185
2.34.1
89
2.34.1
186
90
187
91
diff view generated by jsdifflib
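
For the probe_access_full() interface above, a caller is expected to consume
the returned CPUTLBEntryFull right away, before any further activity on that
mmu_idx can recycle the entry. The function below is an invented usage example,
not code from the series:

    static bool example_large_page_hit(CPUArchState *env, target_ulong addr,
                                       int mmu_idx, uintptr_t ra)
    {
        CPUTLBEntryFull *full;
        void *host;
        int flags = probe_access_full(env, addr, MMU_DATA_LOAD, mmu_idx,
                                      true, &host, &full, ra);

        if (flags & TLB_INVALID_MASK) {
            return false;             /* nonfault probe found no mapping */
        }
        /* Use @full immediately; it is only valid until the next lookup. */
        return full->lg_page_size > TARGET_PAGE_BITS;
    }
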
1
From: Alex Bennée <alex.bennee@linaro.org>
1
The INDEX_op_goto_tb opcode needs no register allocation.
2
Split out a dedicated helper function for it.
2
3
3
This is a heavily used function so let's avoid the cost of
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
4
CPU_GET_CLASS. On the romulus-bmc run it has a modest effect:
5
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
5
6
Before: 36.812 s ± 0.506 s
7
After: 35.912 s ± 0.168 s
8
9
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
10
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
11
Message-Id: <20220811151413.3350684-4-alex.bennee@linaro.org>
12
Signed-off-by: Cédric Le Goater <clg@kaod.org>
13
Message-Id: <20220923084803.498337-4-clg@kaod.org>
14
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
15
---
7
---
16
hw/core/cpu-sysemu.c | 5 ++---
8
tcg/tcg.c | 4 ++
17
1 file changed, 2 insertions(+), 3 deletions(-)
9
tcg/aarch64/tcg-target.c.inc | 40 ++++++++++---------
10
tcg/arm/tcg-target.c.inc | 49 ++++++++++++-----------
11
tcg/i386/tcg-target.c.inc | 33 ++++++++--------
12
tcg/loongarch64/tcg-target.c.inc | 38 +++++++++---------
13
tcg/mips/tcg-target.c.inc | 21 +++++-----
14
tcg/ppc/tcg-target.c.inc | 52 ++++++++++++------------
15
tcg/riscv/tcg-target.c.inc | 20 +++++-----
16
tcg/s390x/tcg-target.c.inc | 31 ++++++++-------
17
tcg/sparc64/tcg-target.c.inc | 68 +++++++++++++++++---------------
18
tcg/tci/tcg-target.c.inc | 16 ++++----
19
11 files changed, 199 insertions(+), 173 deletions(-)
18
20
19
diff --git a/hw/core/cpu-sysemu.c b/hw/core/cpu-sysemu.c
21
diff --git a/tcg/tcg.c b/tcg/tcg.c
20
index XXXXXXX..XXXXXXX 100644
22
index XXXXXXX..XXXXXXX 100644
21
--- a/hw/core/cpu-sysemu.c
23
--- a/tcg/tcg.c
22
+++ b/hw/core/cpu-sysemu.c
24
+++ b/tcg/tcg.c
23
@@ -XXX,XX +XXX,XX @@ hwaddr cpu_get_phys_page_debug(CPUState *cpu, vaddr addr)
25
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg);
24
26
static void tcg_out_movi(TCGContext *s, TCGType type,
25
int cpu_asidx_from_attrs(CPUState *cpu, MemTxAttrs attrs)
27
TCGReg ret, tcg_target_long arg);
26
{
28
static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg);
27
- CPUClass *cc = CPU_GET_CLASS(cpu);
29
+static void tcg_out_goto_tb(TCGContext *s, int which);
28
int ret = 0;
30
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
29
31
const TCGArg args[TCG_MAX_OP_ARGS],
30
- if (cc->sysemu_ops->asidx_from_attrs) {
32
const int const_args[TCG_MAX_OP_ARGS]);
31
- ret = cc->sysemu_ops->asidx_from_attrs(cpu, attrs);
33
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, target_ulong pc_start)
32
+ if (cpu->cc->sysemu_ops->asidx_from_attrs) {
34
case INDEX_op_exit_tb:
33
+ ret = cpu->cc->sysemu_ops->asidx_from_attrs(cpu, attrs);
35
tcg_out_exit_tb(s, op->args[0]);
34
assert(ret < cpu->num_ases && ret >= 0);
36
break;
35
}
37
+ case INDEX_op_goto_tb:
36
return ret;
38
+ tcg_out_goto_tb(s, op->args[0]);
39
+ break;
40
case INDEX_op_dup2_vec:
41
if (tcg_reg_alloc_dup2(s, op)) {
42
break;
43
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
44
index XXXXXXX..XXXXXXX 100644
45
--- a/tcg/aarch64/tcg-target.c.inc
46
+++ b/tcg/aarch64/tcg-target.c.inc
47
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
48
}
49
}
50
51
+static void tcg_out_goto_tb(TCGContext *s, int which)
52
+{
53
+ /*
54
+ * Ensure that ADRP+ADD are 8-byte aligned so that an atomic
55
+ * write can be used to patch the target address.
56
+ */
57
+ if ((uintptr_t)s->code_ptr & 7) {
58
+ tcg_out32(s, NOP);
59
+ }
60
+ set_jmp_insn_offset(s, which);
61
+ /*
62
+ * actual branch destination will be patched by
63
+ * tb_target_set_jmp_target later
64
+ */
65
+ tcg_out_insn(s, 3406, ADRP, TCG_REG_TMP, 0);
66
+ tcg_out_insn(s, 3401, ADDI, TCG_TYPE_I64, TCG_REG_TMP, TCG_REG_TMP, 0);
67
+ tcg_out_insn(s, 3207, BR, TCG_REG_TMP);
68
+ set_jmp_reset_offset(s, which);
69
+}
70
+
71
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
72
const TCGArg args[TCG_MAX_OP_ARGS],
73
const int const_args[TCG_MAX_OP_ARGS])
74
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
75
#define REG0(I) (const_args[I] ? TCG_REG_XZR : (TCGReg)args[I])
76
77
switch (opc) {
78
- case INDEX_op_goto_tb:
79
- /*
80
- * Ensure that ADRP+ADD are 8-byte aligned so that an atomic
81
- * write can be used to patch the target address.
82
- */
83
- if ((uintptr_t)s->code_ptr & 7) {
84
- tcg_out32(s, NOP);
85
- }
86
- set_jmp_insn_offset(s, a0);
87
- /*
88
- * actual branch destination will be patched by
89
- * tb_target_set_jmp_target later
90
- */
91
- tcg_out_insn(s, 3406, ADRP, TCG_REG_TMP, 0);
92
- tcg_out_insn(s, 3401, ADDI, TCG_TYPE_I64, TCG_REG_TMP, TCG_REG_TMP, 0);
93
- tcg_out_insn(s, 3207, BR, TCG_REG_TMP);
94
- set_jmp_reset_offset(s, a0);
95
- break;
96
-
97
case INDEX_op_goto_ptr:
98
tcg_out_insn(s, 3207, BR, a0);
99
break;
100
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
101
case INDEX_op_mov_i64:
102
case INDEX_op_call: /* Always emitted via tcg_out_call. */
103
case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
104
+ case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */
105
default:
106
g_assert_not_reached();
107
}
108
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
109
index XXXXXXX..XXXXXXX 100644
110
--- a/tcg/arm/tcg-target.c.inc
111
+++ b/tcg/arm/tcg-target.c.inc
112
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg)
113
tcg_out_epilogue(s);
114
}
115
116
+static void tcg_out_goto_tb(TCGContext *s, int which)
117
+{
118
+ /* Indirect jump method */
119
+ intptr_t ptr, dif, dil;
120
+ TCGReg base = TCG_REG_PC;
121
+
122
+ qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
123
+ ptr = get_jmp_target_addr(s, which);
124
+ dif = tcg_pcrel_diff(s, (void *)ptr) - 8;
125
+ dil = sextract32(dif, 0, 12);
126
+ if (dif != dil) {
127
+ /*
128
+ * The TB is close, but outside the 12 bits addressable by
129
+ * the load. We can extend this to 20 bits with a sub of a
130
+ * shifted immediate from pc. In the vastly unlikely event
131
+ * the code requires more than 1MB, we'll use 2 insns and
132
+ * be no worse off.
133
+ */
134
+ base = TCG_REG_R0;
135
+ tcg_out_movi32(s, COND_AL, base, ptr - dil);
136
+ }
137
+ tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, base, dil);
138
+ set_jmp_reset_offset(s, which);
139
+}
140
+
141
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
142
const TCGArg args[TCG_MAX_OP_ARGS],
143
const int const_args[TCG_MAX_OP_ARGS])
144
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
145
int c;
146
147
switch (opc) {
148
- case INDEX_op_goto_tb:
149
- {
150
- /* Indirect jump method */
151
- intptr_t ptr, dif, dil;
152
- TCGReg base = TCG_REG_PC;
153
-
154
- qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
155
- ptr = get_jmp_target_addr(s, args[0]);
156
- dif = tcg_pcrel_diff(s, (void *)ptr) - 8;
157
- dil = sextract32(dif, 0, 12);
158
- if (dif != dil) {
159
- /* The TB is close, but outside the 12 bits addressable by
160
- the load. We can extend this to 20 bits with a sub of a
161
- shifted immediate from pc. In the vastly unlikely event
162
- the code requires more than 1MB, we'll use 2 insns and
163
- be no worse off. */
164
- base = TCG_REG_R0;
165
- tcg_out_movi32(s, COND_AL, base, ptr - dil);
166
- }
167
- tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, base, dil);
168
- set_jmp_reset_offset(s, args[0]);
169
- }
170
- break;
171
case INDEX_op_goto_ptr:
172
tcg_out_b_reg(s, COND_AL, args[0]);
173
break;
174
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
175
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
176
case INDEX_op_call: /* Always emitted via tcg_out_call. */
177
case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
178
+ case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */
179
default:
180
tcg_abort();
181
}
182
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
183
index XXXXXXX..XXXXXXX 100644
184
--- a/tcg/i386/tcg-target.c.inc
185
+++ b/tcg/i386/tcg-target.c.inc
186
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
187
}
188
}
189
190
+static void tcg_out_goto_tb(TCGContext *s, int which)
191
+{
192
+ /*
193
+ * Jump displacement must be aligned for atomic patching;
194
+ * see if we need to add extra nops before jump
195
+ */
196
+ int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
197
+ if (gap != 1) {
198
+ tcg_out_nopn(s, gap - 1);
199
+ }
200
+ tcg_out8(s, OPC_JMP_long); /* jmp im */
201
+ set_jmp_insn_offset(s, which);
202
+ tcg_out32(s, 0);
203
+ set_jmp_reset_offset(s, which);
204
+}
205
+
206
static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
207
const TCGArg args[TCG_MAX_OP_ARGS],
208
const int const_args[TCG_MAX_OP_ARGS])
209
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
210
const_a2 = const_args[2];
211
212
switch (opc) {
213
- case INDEX_op_goto_tb:
214
- {
215
- /*
216
- * Jump displacement must be aligned for atomic patching;
217
- * see if we need to add extra nops before jump
218
- */
219
- int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
220
- if (gap != 1) {
221
- tcg_out_nopn(s, gap - 1);
222
- }
223
- tcg_out8(s, OPC_JMP_long); /* jmp im */
224
- set_jmp_insn_offset(s, a0);
225
- tcg_out32(s, 0);
226
- }
227
- set_jmp_reset_offset(s, a0);
228
- break;
229
case INDEX_op_goto_ptr:
230
/* jmp to the given host address (could be epilogue) */
231
tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
232
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
233
case INDEX_op_mov_i64:
234
case INDEX_op_call: /* Always emitted via tcg_out_call. */
235
case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
236
+ case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */
237
default:
238
tcg_abort();
239
}
240
diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
241
index XXXXXXX..XXXXXXX 100644
242
--- a/tcg/loongarch64/tcg-target.c.inc
243
+++ b/tcg/loongarch64/tcg-target.c.inc
244
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
245
}
246
}
247
248
+static void tcg_out_goto_tb(TCGContext *s, int which)
249
+{
250
+ /*
251
+ * Ensure that patch area is 8-byte aligned so that an
252
+ * atomic write can be used to patch the target address.
253
+ */
254
+ if ((uintptr_t)s->code_ptr & 7) {
255
+ tcg_out_nop(s);
256
+ }
257
+ set_jmp_insn_offset(s, which);
258
+ /*
259
+ * actual branch destination will be patched by
260
+ * tb_target_set_jmp_target later
261
+ */
262
+ tcg_out_opc_pcaddu18i(s, TCG_REG_TMP0, 0);
263
+ tcg_out_opc_jirl(s, TCG_REG_ZERO, TCG_REG_TMP0, 0);
264
+ set_jmp_reset_offset(s, which);
265
+}
266
+
267
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
268
const TCGArg args[TCG_MAX_OP_ARGS],
269
const int const_args[TCG_MAX_OP_ARGS])
270
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
271
int c2 = const_args[2];
272
273
switch (opc) {
274
- case INDEX_op_goto_tb:
275
- /*
276
- * Ensure that patch area is 8-byte aligned so that an
277
- * atomic write can be used to patch the target address.
278
- */
279
- if ((uintptr_t)s->code_ptr & 7) {
280
- tcg_out_nop(s);
281
- }
282
- set_jmp_insn_offset(s, a0);
283
- /*
284
- * actual branch destination will be patched by
285
- * tb_target_set_jmp_target later
286
- */
287
- tcg_out_opc_pcaddu18i(s, TCG_REG_TMP0, 0);
288
- tcg_out_opc_jirl(s, TCG_REG_ZERO, TCG_REG_TMP0, 0);
289
- set_jmp_reset_offset(s, a0);
290
- break;
291
-
292
case INDEX_op_mb:
293
tcg_out_mb(s, a0);
294
break;
295
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
296
case INDEX_op_mov_i64:
297
case INDEX_op_call: /* Always emitted via tcg_out_call. */
298
case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
299
+ case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */
300
default:
301
g_assert_not_reached();
302
}
303
diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
304
index XXXXXXX..XXXXXXX 100644
305
--- a/tcg/mips/tcg-target.c.inc
306
+++ b/tcg/mips/tcg-target.c.inc
307
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
308
tcg_out_opc_imm(s, OPC_ORI, TCG_REG_V0, b0, a0 & 0xffff);
309
}
310
311
+static void tcg_out_goto_tb(TCGContext *s, int which)
312
+{
313
+ /* indirect jump method */
314
+ qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
315
+ tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP0, TCG_REG_ZERO,
316
+ get_jmp_target_addr(s, which));
317
+ tcg_out_opc_reg(s, OPC_JR, 0, TCG_TMP0, 0);
318
+ tcg_out_nop(s);
319
+ set_jmp_reset_offset(s, which);
320
+}
321
+
322
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
323
const TCGArg args[TCG_MAX_OP_ARGS],
324
const int const_args[TCG_MAX_OP_ARGS])
325
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
326
c2 = const_args[2];
327
328
switch (opc) {
329
- case INDEX_op_goto_tb:
330
- /* indirect jump method */
331
- qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
332
- tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP0, TCG_REG_ZERO,
333
- get_jmp_target_addr(s, a0));
334
- tcg_out_opc_reg(s, OPC_JR, 0, TCG_TMP0, 0);
335
- tcg_out_nop(s);
336
- set_jmp_reset_offset(s, a0);
337
- break;
338
case INDEX_op_goto_ptr:
339
/* jmp to the given host address (could be epilogue) */
340
tcg_out_opc_reg(s, OPC_JR, 0, a0, 0);
341
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
342
case INDEX_op_mov_i64:
343
case INDEX_op_call: /* Always emitted via tcg_out_call. */
344
case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
345
+ case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */
346
default:
347
tcg_abort();
348
}
349
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
350
index XXXXXXX..XXXXXXX 100644
351
--- a/tcg/ppc/tcg-target.c.inc
352
+++ b/tcg/ppc/tcg-target.c.inc
353
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg)
354
tcg_out_b(s, 0, tcg_code_gen_epilogue);
355
}
356
357
+static void tcg_out_goto_tb(TCGContext *s, int which)
358
+{
359
+ /* Direct jump. */
360
+ if (TCG_TARGET_REG_BITS == 64) {
361
+ /* Ensure the next insns are 8 or 16-byte aligned. */
362
+ while ((uintptr_t)s->code_ptr & (have_isa_2_07 ? 15 : 7)) {
363
+ tcg_out32(s, NOP);
364
+ }
365
+ set_jmp_insn_offset(s, which);
366
+ tcg_out32(s, ADDIS | TAI(TCG_REG_TB, TCG_REG_TB, 0));
367
+ tcg_out32(s, ADDI | TAI(TCG_REG_TB, TCG_REG_TB, 0));
368
+ tcg_out32(s, MTSPR | RS(TCG_REG_TB) | CTR);
369
+ tcg_out32(s, BCCTR | BO_ALWAYS);
370
+ set_jmp_reset_offset(s, which);
371
+ if (USE_REG_TB) {
372
+ /* For the unlinked case, need to reset TCG_REG_TB. */
373
+ tcg_out_mem_long(s, ADDI, ADD, TCG_REG_TB, TCG_REG_TB,
374
+ -tcg_current_code_size(s));
375
+ }
376
+ } else {
377
+ set_jmp_insn_offset(s, which);
378
+ tcg_out32(s, B);
379
+ set_jmp_reset_offset(s, which);
380
+ }
381
+}
382
+
383
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
384
const TCGArg args[TCG_MAX_OP_ARGS],
385
const int const_args[TCG_MAX_OP_ARGS])
386
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
387
TCGArg a0, a1, a2;
388
389
switch (opc) {
390
- case INDEX_op_goto_tb:
391
- /* Direct jump. */
392
- if (TCG_TARGET_REG_BITS == 64) {
393
- /* Ensure the next insns are 8 or 16-byte aligned. */
394
- while ((uintptr_t)s->code_ptr & (have_isa_2_07 ? 15 : 7)) {
395
- tcg_out32(s, NOP);
396
- }
397
- set_jmp_insn_offset(s, args[0]);
398
- tcg_out32(s, ADDIS | TAI(TCG_REG_TB, TCG_REG_TB, 0));
399
- tcg_out32(s, ADDI | TAI(TCG_REG_TB, TCG_REG_TB, 0));
400
- } else {
401
- set_jmp_insn_offset(s, args[0]);
402
- tcg_out32(s, B);
403
- set_jmp_reset_offset(s, args[0]);
404
- break;
405
- }
406
- tcg_out32(s, MTSPR | RS(TCG_REG_TB) | CTR);
407
- tcg_out32(s, BCCTR | BO_ALWAYS);
408
- set_jmp_reset_offset(s, args[0]);
409
- if (USE_REG_TB) {
410
- /* For the unlinked case, need to reset TCG_REG_TB. */
411
- tcg_out_mem_long(s, ADDI, ADD, TCG_REG_TB, TCG_REG_TB,
412
- -tcg_current_code_size(s));
413
- }
414
- break;
415
case INDEX_op_goto_ptr:
416
tcg_out32(s, MTSPR | RS(args[0]) | CTR);
417
if (USE_REG_TB) {
418
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
419
case INDEX_op_mov_i64:
420
case INDEX_op_call: /* Always emitted via tcg_out_call. */
421
case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
422
+ case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */
423
default:
424
tcg_abort();
425
}
426
diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
427
index XXXXXXX..XXXXXXX 100644
428
--- a/tcg/riscv/tcg-target.c.inc
429
+++ b/tcg/riscv/tcg-target.c.inc
430
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
431
}
432
}
433
434
+static void tcg_out_goto_tb(TCGContext *s, int which)
435
+{
436
+ qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
437
+ /* indirect jump method */
438
+ tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, TCG_REG_ZERO,
439
+ get_jmp_target_addr(s, which));
440
+ tcg_out_opc_imm(s, OPC_JALR, TCG_REG_ZERO, TCG_REG_TMP0, 0);
441
+ set_jmp_reset_offset(s, which);
442
+}
443
+
444
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
445
const TCGArg args[TCG_MAX_OP_ARGS],
446
const int const_args[TCG_MAX_OP_ARGS])
447
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
448
int c2 = const_args[2];
449
450
switch (opc) {
451
- case INDEX_op_goto_tb:
452
- qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
453
- /* indirect jump method */
454
- tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, TCG_REG_ZERO,
455
- get_jmp_target_addr(s, a0));
456
- tcg_out_opc_imm(s, OPC_JALR, TCG_REG_ZERO, TCG_REG_TMP0, 0);
457
- set_jmp_reset_offset(s, a0);
458
- break;
459
-
460
case INDEX_op_goto_ptr:
461
tcg_out_opc_imm(s, OPC_JALR, TCG_REG_ZERO, a0, 0);
462
break;
463
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
464
case INDEX_op_mov_i64:
465
case INDEX_op_call: /* Always emitted via tcg_out_call. */
466
case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
467
+ case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */
468
default:
469
g_assert_not_reached();
470
}
471
diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
472
index XXXXXXX..XXXXXXX 100644
473
--- a/tcg/s390x/tcg-target.c.inc
474
+++ b/tcg/s390x/tcg-target.c.inc
475
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
476
}
477
}
478
479
+static void tcg_out_goto_tb(TCGContext *s, int which)
480
+{
481
+ /*
482
+ * Branch displacement must be aligned for atomic patching;
483
+ * see if we need to add extra nop before branch
484
+ */
485
+ if (!QEMU_PTR_IS_ALIGNED(s->code_ptr + 1, 4)) {
486
+ tcg_out16(s, NOP);
487
+ }
488
+ tcg_out16(s, RIL_BRCL | (S390_CC_ALWAYS << 4));
489
+ set_jmp_insn_offset(s, which);
490
+ s->code_ptr += 2;
491
+ set_jmp_reset_offset(s, which);
492
+}
493
+
494
# define OP_32_64(x) \
495
case glue(glue(INDEX_op_,x),_i32): \
496
case glue(glue(INDEX_op_,x),_i64)
497
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
498
TCGArg a0, a1, a2;
499
500
switch (opc) {
501
- case INDEX_op_goto_tb:
502
- a0 = args[0];
503
- /*
504
- * branch displacement must be aligned for atomic patching;
505
- * see if we need to add extra nop before branch
506
- */
507
- if (!QEMU_PTR_IS_ALIGNED(s->code_ptr + 1, 4)) {
508
- tcg_out16(s, NOP);
509
- }
510
- tcg_out16(s, RIL_BRCL | (S390_CC_ALWAYS << 4));
511
- set_jmp_insn_offset(s, a0);
512
- s->code_ptr += 2;
513
- set_jmp_reset_offset(s, a0);
514
- break;
515
-
516
case INDEX_op_goto_ptr:
517
a0 = args[0];
518
tcg_out_insn(s, RR, BCR, S390_CC_ALWAYS, a0);
519
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
520
case INDEX_op_mov_i64:
521
case INDEX_op_call: /* Always emitted via tcg_out_call. */
522
case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
523
+ case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */
524
default:
525
tcg_abort();
526
}
527
diff --git a/tcg/sparc64/tcg-target.c.inc b/tcg/sparc64/tcg-target.c.inc
528
index XXXXXXX..XXXXXXX 100644
529
--- a/tcg/sparc64/tcg-target.c.inc
530
+++ b/tcg/sparc64/tcg-target.c.inc
531
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
532
tcg_out_arithi(s, TCG_REG_O0, TCG_REG_O0, a0 & 0x3ff, ARITH_OR);
533
}
534
535
+static void tcg_out_goto_tb(TCGContext *s, int which)
536
+{
537
+ /* Direct jump. */
538
+ if (USE_REG_TB) {
539
+ /* make sure the patch is 8-byte aligned. */
540
+ if ((intptr_t)s->code_ptr & 4) {
541
+ tcg_out_nop(s);
542
+ }
543
+ set_jmp_insn_offset(s, which);
544
+ tcg_out_sethi(s, TCG_REG_T1, 0);
545
+ tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, 0, ARITH_OR);
546
+ tcg_out_arith(s, TCG_REG_G0, TCG_REG_TB, TCG_REG_T1, JMPL);
547
+ tcg_out_arith(s, TCG_REG_TB, TCG_REG_TB, TCG_REG_T1, ARITH_ADD);
548
+ } else {
549
+ set_jmp_insn_offset(s, which);
550
+ tcg_out32(s, CALL);
551
+ tcg_out_nop(s);
552
+ }
553
+ set_jmp_reset_offset(s, which);
554
+
555
+ /*
556
+ * For the unlinked path of goto_tb, we need to reset TCG_REG_TB
557
+ * to the beginning of this TB.
558
+ */
559
+ if (USE_REG_TB) {
560
+ int c = -tcg_current_code_size(s);
561
+ if (check_fit_i32(c, 13)) {
562
+ tcg_out_arithi(s, TCG_REG_TB, TCG_REG_TB, c, ARITH_ADD);
563
+ } else {
564
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_T1, c);
565
+ tcg_out_arith(s, TCG_REG_TB, TCG_REG_TB, TCG_REG_T1, ARITH_ADD);
566
+ }
567
+ }
568
+}
569
+
570
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
571
const TCGArg args[TCG_MAX_OP_ARGS],
572
const int const_args[TCG_MAX_OP_ARGS])
573
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
574
c2 = const_args[2];
575
576
switch (opc) {
577
- case INDEX_op_goto_tb:
578
- /* Direct jump. */
579
- if (USE_REG_TB) {
580
- /* make sure the patch is 8-byte aligned. */
581
- if ((intptr_t)s->code_ptr & 4) {
582
- tcg_out_nop(s);
583
- }
584
- set_jmp_insn_offset(s, a0);
585
- tcg_out_sethi(s, TCG_REG_T1, 0);
586
- tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, 0, ARITH_OR);
587
- tcg_out_arith(s, TCG_REG_G0, TCG_REG_TB, TCG_REG_T1, JMPL);
588
- tcg_out_arith(s, TCG_REG_TB, TCG_REG_TB, TCG_REG_T1, ARITH_ADD);
589
- } else {
590
- set_jmp_insn_offset(s, a0);
591
- tcg_out32(s, CALL);
592
- tcg_out_nop(s);
593
- }
594
- set_jmp_reset_offset(s, a0);
595
-
596
- /* For the unlinked path of goto_tb, we need to reset
597
- TCG_REG_TB to the beginning of this TB. */
598
- if (USE_REG_TB) {
599
- c = -tcg_current_code_size(s);
600
- if (check_fit_i32(c, 13)) {
601
- tcg_out_arithi(s, TCG_REG_TB, TCG_REG_TB, c, ARITH_ADD);
602
- } else {
603
- tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_T1, c);
604
- tcg_out_arith(s, TCG_REG_TB, TCG_REG_TB,
605
- TCG_REG_T1, ARITH_ADD);
606
- }
607
- }
608
- break;
609
case INDEX_op_goto_ptr:
610
tcg_out_arithi(s, TCG_REG_G0, a0, 0, JMPL);
611
if (USE_REG_TB) {
612
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
613
case INDEX_op_mov_i64:
614
case INDEX_op_call: /* Always emitted via tcg_out_call. */
615
case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
616
+ case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */
617
default:
618
tcg_abort();
619
}
620
diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
621
index XXXXXXX..XXXXXXX 100644
622
--- a/tcg/tci/tcg-target.c.inc
623
+++ b/tcg/tci/tcg-target.c.inc
624
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg)
625
tcg_out_op_p(s, INDEX_op_exit_tb, (void *)arg);
626
}
627
628
+static void tcg_out_goto_tb(TCGContext *s, int which)
629
+{
630
+ qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
631
+ /* indirect jump method. */
632
+ tcg_out_op_p(s, INDEX_op_goto_tb, (void *)get_jmp_target_addr(s, which));
633
+ set_jmp_reset_offset(s, which);
634
+}
635
+
636
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
637
const TCGArg args[TCG_MAX_OP_ARGS],
638
const int const_args[TCG_MAX_OP_ARGS])
639
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
640
TCGOpcode exts;
641
642
switch (opc) {
643
- case INDEX_op_goto_tb:
644
- qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
645
- /* indirect jump method. */
646
- tcg_out_op_p(s, opc, (void *)get_jmp_target_addr(s, args[0]));
647
- set_jmp_reset_offset(s, args[0]);
648
- break;
649
-
650
case INDEX_op_goto_ptr:
651
tcg_out_op_r(s, opc, args[0]);
652
break;
653
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
654
case INDEX_op_mov_i64:
655
case INDEX_op_call: /* Always emitted via tcg_out_call. */
656
case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
657
+ case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */
658
default:
659
tcg_abort();
660
}
37
--
661
--
38
2.34.1
662
2.34.1
39
663
40
664
diff view generated by jsdifflib
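
The cpu->cc change in the pair above is an instance of a common pattern:
resolve the class pointer once, store it in the instance, and let hot paths do
a plain pointer chase instead of a checked cast on every call; the romulus-bmc
timings quoted earlier show the sort of win this buys. A generic sketch with
invented names (this is not the QOM code):

    typedef struct Widget Widget;

    typedef struct {
        int (*asidx_from_attrs)(Widget *w, unsigned attrs);
    } WidgetClass;

    struct Widget {
        const WidgetClass *wc;    /* cached once, like CPUState::cc */
    };

    static inline int widget_asidx(Widget *w, unsigned attrs)
    {
        /* Hot path: one load through the cached class pointer,
         * no per-call dynamic cast. */
        return w->wc->asidx_from_attrs ? w->wc->asidx_from_attrs(w, attrs) : 0;
    }
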
1
The availability of tb->pc will shortly be conditional.
1
This will shortly be used for more than reset.
2
Introduce accessor functions to minimize ifdefs.
3
4
Pass around a known pc to places like tcg_gen_code,
5
where the caller must already have the value.
6
2
7
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
3
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
4
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
8
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
9
---
6
---
10
accel/tcg/internal.h | 6 ++++
7
include/exec/exec-all.h | 2 +-
11
include/exec/exec-all.h | 6 ++++
8
accel/tcg/translate-all.c | 8 ++++----
12
include/tcg/tcg.h | 2 +-
9
tcg/tcg.c | 4 ++--
13
accel/tcg/cpu-exec.c | 46 ++++++++++++++-----------
10
3 files changed, 7 insertions(+), 7 deletions(-)
14
accel/tcg/translate-all.c | 37 +++++++++++---------
15
target/arm/cpu.c | 4 +--
16
target/avr/cpu.c | 2 +-
17
target/hexagon/cpu.c | 2 +-
18
target/hppa/cpu.c | 4 +--
19
target/i386/tcg/tcg-cpu.c | 2 +-
20
target/loongarch/cpu.c | 2 +-
21
target/microblaze/cpu.c | 2 +-
22
target/mips/tcg/exception.c | 2 +-
23
target/mips/tcg/sysemu/special_helper.c | 2 +-
24
target/openrisc/cpu.c | 2 +-
25
target/riscv/cpu.c | 4 +--
26
target/rx/cpu.c | 2 +-
27
target/sh4/cpu.c | 4 +--
28
target/sparc/cpu.c | 2 +-
29
target/tricore/cpu.c | 2 +-
30
tcg/tcg.c | 8 ++---
31
21 files changed, 82 insertions(+), 61 deletions(-)
32
11
33
diff --git a/accel/tcg/internal.h b/accel/tcg/internal.h
34
index XXXXXXX..XXXXXXX 100644
35
--- a/accel/tcg/internal.h
36
+++ b/accel/tcg/internal.h
37
@@ -XXX,XX +XXX,XX @@ G_NORETURN void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr);
38
void page_init(void);
39
void tb_htable_init(void);
40
41
+/* Return the current PC from CPU, which may be cached in TB. */
42
+static inline target_ulong log_pc(CPUState *cpu, const TranslationBlock *tb)
43
+{
44
+ return tb_pc(tb);
45
+}
46
+
47
#endif /* ACCEL_TCG_INTERNAL_H */
48
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
12
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
49
index XXXXXXX..XXXXXXX 100644
13
index XXXXXXX..XXXXXXX 100644
50
--- a/include/exec/exec-all.h
14
--- a/include/exec/exec-all.h
51
+++ b/include/exec/exec-all.h
15
+++ b/include/exec/exec-all.h
52
@@ -XXX,XX +XXX,XX @@ struct TranslationBlock {
16
@@ -XXX,XX +XXX,XX @@ struct TranslationBlock {
53
uintptr_t jmp_dest[2];
17
* setting one of the jump targets (or patching the jump instruction). Only
54
};
18
* two of such jumps are supported.
55
19
*/
56
+/* Hide the read to avoid ifdefs for TARGET_TB_PCREL. */
20
+#define TB_JMP_OFFSET_INVALID 0xffff /* indicates no jump generated */
57
+static inline target_ulong tb_pc(const TranslationBlock *tb)
21
uint16_t jmp_reset_offset[2]; /* offset of original jump target */
58
+{
22
-#define TB_JMP_RESET_OFFSET_INVALID 0xffff /* indicates no jump generated */
59
+ return tb->pc;
23
uintptr_t jmp_target_arg[2]; /* target address or offset */
60
+}
24
61
+
25
/*
62
/* Hide the qatomic_read to make code a little easier on the eyes */
63
static inline uint32_t tb_cflags(const TranslationBlock *tb)
64
{
65
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
66
index XXXXXXX..XXXXXXX 100644
67
--- a/include/tcg/tcg.h
68
+++ b/include/tcg/tcg.h
69
@@ -XXX,XX +XXX,XX @@ void tcg_register_thread(void);
70
void tcg_prologue_init(TCGContext *s);
71
void tcg_func_start(TCGContext *s);
72
73
-int tcg_gen_code(TCGContext *s, TranslationBlock *tb);
74
+int tcg_gen_code(TCGContext *s, TranslationBlock *tb, target_ulong pc_start);
75
76
void tcg_set_frame(TCGContext *s, TCGReg reg, intptr_t start, intptr_t size);
77
78
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
79
index XXXXXXX..XXXXXXX 100644
80
--- a/accel/tcg/cpu-exec.c
81
+++ b/accel/tcg/cpu-exec.c
82
@@ -XXX,XX +XXX,XX @@ static bool tb_lookup_cmp(const void *p, const void *d)
83
const TranslationBlock *tb = p;
84
const struct tb_desc *desc = d;
85
86
- if (tb->pc == desc->pc &&
87
+ if (tb_pc(tb) == desc->pc &&
88
tb->page_addr[0] == desc->page_addr0 &&
89
tb->cs_base == desc->cs_base &&
90
tb->flags == desc->flags &&
91
@@ -XXX,XX +XXX,XX @@ static inline TranslationBlock *tb_lookup(CPUState *cpu, target_ulong pc,
92
return tb;
93
}
94
95
-static inline void log_cpu_exec(target_ulong pc, CPUState *cpu,
96
- const TranslationBlock *tb)
97
+static void log_cpu_exec(target_ulong pc, CPUState *cpu,
98
+ const TranslationBlock *tb)
99
{
100
- if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_CPU | CPU_LOG_EXEC))
101
- && qemu_log_in_addr_range(pc)) {
102
-
103
+ if (qemu_log_in_addr_range(pc)) {
104
qemu_log_mask(CPU_LOG_EXEC,
105
"Trace %d: %p [" TARGET_FMT_lx
106
"/" TARGET_FMT_lx "/%08x/%08x] %s\n",
107
@@ -XXX,XX +XXX,XX @@ const void *HELPER(lookup_tb_ptr)(CPUArchState *env)
108
return tcg_code_gen_epilogue;
109
}
110
111
- log_cpu_exec(pc, cpu, tb);
112
+ if (qemu_loglevel_mask(CPU_LOG_TB_CPU | CPU_LOG_EXEC)) {
113
+ log_cpu_exec(pc, cpu, tb);
114
+ }
115
116
return tb->tc.ptr;
117
}
118
@@ -XXX,XX +XXX,XX @@ cpu_tb_exec(CPUState *cpu, TranslationBlock *itb, int *tb_exit)
119
TranslationBlock *last_tb;
120
const void *tb_ptr = itb->tc.ptr;
121
122
- log_cpu_exec(itb->pc, cpu, itb);
123
+ if (qemu_loglevel_mask(CPU_LOG_TB_CPU | CPU_LOG_EXEC)) {
124
+ log_cpu_exec(log_pc(cpu, itb), cpu, itb);
125
+ }
126
127
qemu_thread_jit_execute();
128
ret = tcg_qemu_tb_exec(env, tb_ptr);
129
@@ -XXX,XX +XXX,XX @@ cpu_tb_exec(CPUState *cpu, TranslationBlock *itb, int *tb_exit)
130
* of the start of the TB.
131
*/
132
CPUClass *cc = CPU_GET_CLASS(cpu);
133
- qemu_log_mask_and_addr(CPU_LOG_EXEC, last_tb->pc,
134
- "Stopped execution of TB chain before %p ["
135
- TARGET_FMT_lx "] %s\n",
136
- last_tb->tc.ptr, last_tb->pc,
137
- lookup_symbol(last_tb->pc));
138
+
139
if (cc->tcg_ops->synchronize_from_tb) {
140
cc->tcg_ops->synchronize_from_tb(cpu, last_tb);
141
} else {
142
assert(cc->set_pc);
143
- cc->set_pc(cpu, last_tb->pc);
144
+ cc->set_pc(cpu, tb_pc(last_tb));
145
+ }
146
+ if (qemu_loglevel_mask(CPU_LOG_EXEC)) {
147
+ target_ulong pc = log_pc(cpu, last_tb);
148
+ if (qemu_log_in_addr_range(pc)) {
149
+ qemu_log("Stopped execution of TB chain before %p ["
150
+ TARGET_FMT_lx "] %s\n",
151
+ last_tb->tc.ptr, pc, lookup_symbol(pc));
152
+ }
153
}
154
}
155
156
@@ -XXX,XX +XXX,XX @@ static inline void tb_add_jump(TranslationBlock *tb, int n,
157
158
qemu_spin_unlock(&tb_next->jmp_lock);
159
160
- qemu_log_mask_and_addr(CPU_LOG_EXEC, tb->pc,
161
- "Linking TBs %p [" TARGET_FMT_lx
162
- "] index %d -> %p [" TARGET_FMT_lx "]\n",
163
- tb->tc.ptr, tb->pc, n,
164
- tb_next->tc.ptr, tb_next->pc);
165
+ qemu_log_mask(CPU_LOG_EXEC, "Linking TBs %p index %d -> %p\n",
166
+ tb->tc.ptr, n, tb_next->tc.ptr);
167
return;
168
169
out_unlock_next:
170
@@ -XXX,XX +XXX,XX @@ static inline bool cpu_handle_interrupt(CPUState *cpu,
171
}
172
173
static inline void cpu_loop_exec_tb(CPUState *cpu, TranslationBlock *tb,
174
+ target_ulong pc,
175
TranslationBlock **last_tb, int *tb_exit)
176
{
177
int32_t insns_left;
178
179
- trace_exec_tb(tb, tb->pc);
180
+ trace_exec_tb(tb, pc);
181
tb = cpu_tb_exec(cpu, tb, tb_exit);
182
if (*tb_exit != TB_EXIT_REQUESTED) {
183
*last_tb = tb;
184
@@ -XXX,XX +XXX,XX @@ int cpu_exec(CPUState *cpu)
185
tb_add_jump(last_tb, tb_exit, tb);
186
}
187
188
- cpu_loop_exec_tb(cpu, tb, &last_tb, &tb_exit);
189
+ cpu_loop_exec_tb(cpu, tb, pc, &last_tb, &tb_exit);
190
191
/* Try to align the host and virtual clocks
192
if the guest is in advance */
193
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
26
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
194
index XXXXXXX..XXXXXXX 100644
27
index XXXXXXX..XXXXXXX 100644
195
--- a/accel/tcg/translate-all.c
28
--- a/accel/tcg/translate-all.c
196
+++ b/accel/tcg/translate-all.c
29
+++ b/accel/tcg/translate-all.c
197
@@ -XXX,XX +XXX,XX @@ static int encode_search(TranslationBlock *tb, uint8_t *block)
30
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
198
31
tb->jmp_dest[1] = (uintptr_t)NULL;
199
for (j = 0; j < TARGET_INSN_START_WORDS; ++j) {
32
200
if (i == 0) {
33
/* init original jump addresses which have been set during tcg_gen_code() */
201
- prev = (j == 0 ? tb->pc : 0);
34
- if (tb->jmp_reset_offset[0] != TB_JMP_RESET_OFFSET_INVALID) {
202
+ prev = (j == 0 ? tb_pc(tb) : 0);
35
+ if (tb->jmp_reset_offset[0] != TB_JMP_OFFSET_INVALID) {
203
} else {
36
tb_reset_jump(tb, 0);
204
prev = tcg_ctx->gen_insn_data[i - 1][j];
205
}
206
@@ -XXX,XX +XXX,XX @@ static int encode_search(TranslationBlock *tb, uint8_t *block)
207
static int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
208
uintptr_t searched_pc, bool reset_icount)
209
{
210
- target_ulong data[TARGET_INSN_START_WORDS] = { tb->pc };
211
+ target_ulong data[TARGET_INSN_START_WORDS] = { tb_pc(tb) };
212
uintptr_t host_pc = (uintptr_t)tb->tc.ptr;
213
CPUArchState *env = cpu->env_ptr;
214
const uint8_t *p = tb->tc.ptr + tb->tc.size;
215
@@ -XXX,XX +XXX,XX @@ static bool tb_cmp(const void *ap, const void *bp)
216
const TranslationBlock *a = ap;
217
const TranslationBlock *b = bp;
218
219
- return a->pc == b->pc &&
220
+ return tb_pc(a) == tb_pc(b) &&
221
a->cs_base == b->cs_base &&
222
a->flags == b->flags &&
223
(tb_cflags(a) & ~CF_INVALID) == (tb_cflags(b) & ~CF_INVALID) &&
224
@@ -XXX,XX +XXX,XX @@ static void do_tb_invalidate_check(void *p, uint32_t hash, void *userp)
225
TranslationBlock *tb = p;
226
target_ulong addr = *(target_ulong *)userp;
227
228
- if (!(addr + TARGET_PAGE_SIZE <= tb->pc || addr >= tb->pc + tb->size)) {
229
+ if (!(addr + TARGET_PAGE_SIZE <= tb_pc(tb) ||
230
+ addr >= tb_pc(tb) + tb->size)) {
231
printf("ERROR invalidate: address=" TARGET_FMT_lx
232
- " PC=%08lx size=%04x\n", addr, (long)tb->pc, tb->size);
233
+ " PC=%08lx size=%04x\n", addr, (long)tb_pc(tb), tb->size);
234
}
37
}
235
}
38
- if (tb->jmp_reset_offset[1] != TB_JMP_RESET_OFFSET_INVALID) {
236
39
+ if (tb->jmp_reset_offset[1] != TB_JMP_OFFSET_INVALID) {
237
@@ -XXX,XX +XXX,XX @@ static void do_tb_page_check(void *p, uint32_t hash, void *userp)
40
tb_reset_jump(tb, 1);
238
TranslationBlock *tb = p;
239
int flags1, flags2;
240
241
- flags1 = page_get_flags(tb->pc);
242
- flags2 = page_get_flags(tb->pc + tb->size - 1);
243
+ flags1 = page_get_flags(tb_pc(tb));
244
+ flags2 = page_get_flags(tb_pc(tb) + tb->size - 1);
245
if ((flags1 & PAGE_WRITE) || (flags2 & PAGE_WRITE)) {
246
printf("ERROR page flags: PC=%08lx size=%04x f1=%x f2=%x\n",
247
- (long)tb->pc, tb->size, flags1, flags2);
248
+ (long)tb_pc(tb), tb->size, flags1, flags2);
249
}
41
}
250
}
42
251
43
@@ -XXX,XX +XXX,XX @@ static gboolean tb_tree_stats_iter(gpointer key, gpointer value, gpointer data)
252
@@ -XXX,XX +XXX,XX @@ static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
44
if (tb_page_addr1(tb) != -1) {
253
45
tst->cross_page++;
254
/* remove the TB from the hash list */
255
phys_pc = tb->page_addr[0];
256
- h = tb_hash_func(phys_pc, tb->pc, tb->flags, orig_cflags,
257
+ h = tb_hash_func(phys_pc, tb_pc(tb), tb->flags, orig_cflags,
258
tb->trace_vcpu_dstate);
259
if (!qht_remove(&tb_ctx.htable, tb, h)) {
260
return;
261
@@ -XXX,XX +XXX,XX @@ tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
262
}
46
}
263
47
- if (tb->jmp_reset_offset[0] != TB_JMP_RESET_OFFSET_INVALID) {
264
/* add in the hash table */
48
+ if (tb->jmp_reset_offset[0] != TB_JMP_OFFSET_INVALID) {
265
- h = tb_hash_func(phys_pc, tb->pc, tb->flags, tb->cflags,
49
tst->direct_jmp_count++;
266
+ h = tb_hash_func(phys_pc, tb_pc(tb), tb->flags, tb->cflags,
50
- if (tb->jmp_reset_offset[1] != TB_JMP_RESET_OFFSET_INVALID) {
267
tb->trace_vcpu_dstate);
51
+ if (tb->jmp_reset_offset[1] != TB_JMP_OFFSET_INVALID) {
268
qht_insert(&tb_ctx.htable, tb, h, &existing_tb);
52
tst->direct_jmp2_count++;
269
53
}
270
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
271
tcg_ctx->cpu = NULL;
272
max_insns = tb->icount;
273
274
- trace_translate_block(tb, tb->pc, tb->tc.ptr);
275
+ trace_translate_block(tb, pc, tb->tc.ptr);
276
277
/* generate machine code */
278
tb->jmp_reset_offset[0] = TB_JMP_RESET_OFFSET_INVALID;
279
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
280
ti = profile_getclock();
281
#endif
282
283
- gen_code_size = tcg_gen_code(tcg_ctx, tb);
284
+ gen_code_size = tcg_gen_code(tcg_ctx, tb, pc);
285
if (unlikely(gen_code_size < 0)) {
286
error_return:
287
switch (gen_code_size) {
288
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
289
290
#ifdef DEBUG_DISAS
291
if (qemu_loglevel_mask(CPU_LOG_TB_OUT_ASM) &&
292
- qemu_log_in_addr_range(tb->pc)) {
293
+ qemu_log_in_addr_range(pc)) {
294
FILE *logfile = qemu_log_trylock();
295
if (logfile) {
296
int code_size, data_size;
297
@@ -XXX,XX +XXX,XX @@ void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr)
298
*/
299
cpu->cflags_next_tb = curr_cflags(cpu) | CF_MEMI_ONLY | CF_LAST_IO | n;
300
301
- qemu_log_mask_and_addr(CPU_LOG_EXEC, tb->pc,
302
- "cpu_io_recompile: rewound execution of TB to "
303
- TARGET_FMT_lx "\n", tb->pc);
304
+ if (qemu_loglevel_mask(CPU_LOG_EXEC)) {
305
+ target_ulong pc = log_pc(cpu, tb);
306
+ if (qemu_log_in_addr_range(pc)) {
307
+ qemu_log("cpu_io_recompile: rewound execution of TB to "
308
+ TARGET_FMT_lx "\n", pc);
309
+ }
310
+ }
311
312
cpu_loop_exit_noexc(cpu);
313
}
314
diff --git a/target/arm/cpu.c b/target/arm/cpu.c
315
index XXXXXXX..XXXXXXX 100644
316
--- a/target/arm/cpu.c
317
+++ b/target/arm/cpu.c
318
@@ -XXX,XX +XXX,XX @@ void arm_cpu_synchronize_from_tb(CPUState *cs,
319
* never possible for an AArch64 TB to chain to an AArch32 TB.
320
*/
321
if (is_a64(env)) {
322
- env->pc = tb->pc;
323
+ env->pc = tb_pc(tb);
324
} else {
325
- env->regs[15] = tb->pc;
326
+ env->regs[15] = tb_pc(tb);
327
}
54
}
328
}
329
#endif /* CONFIG_TCG */
330
diff --git a/target/avr/cpu.c b/target/avr/cpu.c
331
index XXXXXXX..XXXXXXX 100644
332
--- a/target/avr/cpu.c
333
+++ b/target/avr/cpu.c
334
@@ -XXX,XX +XXX,XX @@ static void avr_cpu_synchronize_from_tb(CPUState *cs,
335
AVRCPU *cpu = AVR_CPU(cs);
336
CPUAVRState *env = &cpu->env;
337
338
- env->pc_w = tb->pc / 2; /* internally PC points to words */
339
+ env->pc_w = tb_pc(tb) / 2; /* internally PC points to words */
340
}
341
342
static void avr_cpu_reset(DeviceState *ds)
343
diff --git a/target/hexagon/cpu.c b/target/hexagon/cpu.c
344
index XXXXXXX..XXXXXXX 100644
345
--- a/target/hexagon/cpu.c
346
+++ b/target/hexagon/cpu.c
347
@@ -XXX,XX +XXX,XX @@ static void hexagon_cpu_synchronize_from_tb(CPUState *cs,
348
{
349
HexagonCPU *cpu = HEXAGON_CPU(cs);
350
CPUHexagonState *env = &cpu->env;
351
- env->gpr[HEX_REG_PC] = tb->pc;
352
+ env->gpr[HEX_REG_PC] = tb_pc(tb);
353
}
354
355
static bool hexagon_cpu_has_work(CPUState *cs)
356
diff --git a/target/hppa/cpu.c b/target/hppa/cpu.c
357
index XXXXXXX..XXXXXXX 100644
358
--- a/target/hppa/cpu.c
359
+++ b/target/hppa/cpu.c
360
@@ -XXX,XX +XXX,XX @@ static void hppa_cpu_synchronize_from_tb(CPUState *cs,
361
HPPACPU *cpu = HPPA_CPU(cs);
362
363
#ifdef CONFIG_USER_ONLY
364
- cpu->env.iaoq_f = tb->pc;
365
+ cpu->env.iaoq_f = tb_pc(tb);
366
cpu->env.iaoq_b = tb->cs_base;
367
#else
368
/* Recover the IAOQ values from the GVA + PRIV. */
369
@@ -XXX,XX +XXX,XX @@ static void hppa_cpu_synchronize_from_tb(CPUState *cs,
370
int32_t diff = cs_base;
371
372
cpu->env.iasq_f = iasq_f;
373
- cpu->env.iaoq_f = (tb->pc & ~iasq_f) + priv;
374
+ cpu->env.iaoq_f = (tb_pc(tb) & ~iasq_f) + priv;
375
if (diff) {
376
cpu->env.iaoq_b = cpu->env.iaoq_f + diff;
377
}
378
diff --git a/target/i386/tcg/tcg-cpu.c b/target/i386/tcg/tcg-cpu.c
379
index XXXXXXX..XXXXXXX 100644
380
--- a/target/i386/tcg/tcg-cpu.c
381
+++ b/target/i386/tcg/tcg-cpu.c
382
@@ -XXX,XX +XXX,XX @@ static void x86_cpu_synchronize_from_tb(CPUState *cs,
383
{
384
X86CPU *cpu = X86_CPU(cs);
385
386
- cpu->env.eip = tb->pc - tb->cs_base;
387
+ cpu->env.eip = tb_pc(tb) - tb->cs_base;
388
}
389
390
#ifndef CONFIG_USER_ONLY
391
diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c
392
index XXXXXXX..XXXXXXX 100644
393
--- a/target/loongarch/cpu.c
394
+++ b/target/loongarch/cpu.c
395
@@ -XXX,XX +XXX,XX @@ static void loongarch_cpu_synchronize_from_tb(CPUState *cs,
396
LoongArchCPU *cpu = LOONGARCH_CPU(cs);
397
CPULoongArchState *env = &cpu->env;
398
399
- env->pc = tb->pc;
400
+ env->pc = tb_pc(tb);
401
}
402
#endif /* CONFIG_TCG */
403
404
diff --git a/target/microblaze/cpu.c b/target/microblaze/cpu.c
405
index XXXXXXX..XXXXXXX 100644
406
--- a/target/microblaze/cpu.c
407
+++ b/target/microblaze/cpu.c
408
@@ -XXX,XX +XXX,XX @@ static void mb_cpu_synchronize_from_tb(CPUState *cs,
409
{
410
MicroBlazeCPU *cpu = MICROBLAZE_CPU(cs);
411
412
- cpu->env.pc = tb->pc;
413
+ cpu->env.pc = tb_pc(tb);
414
cpu->env.iflags = tb->flags & IFLAGS_TB_MASK;
415
}
416
417
diff --git a/target/mips/tcg/exception.c b/target/mips/tcg/exception.c
418
index XXXXXXX..XXXXXXX 100644
419
--- a/target/mips/tcg/exception.c
420
+++ b/target/mips/tcg/exception.c
421
@@ -XXX,XX +XXX,XX @@ void mips_cpu_synchronize_from_tb(CPUState *cs, const TranslationBlock *tb)
422
MIPSCPU *cpu = MIPS_CPU(cs);
423
CPUMIPSState *env = &cpu->env;
424
425
- env->active_tc.PC = tb->pc;
426
+ env->active_tc.PC = tb_pc(tb);
427
env->hflags &= ~MIPS_HFLAG_BMASK;
428
env->hflags |= tb->flags & MIPS_HFLAG_BMASK;
429
}
430
diff --git a/target/mips/tcg/sysemu/special_helper.c b/target/mips/tcg/sysemu/special_helper.c
431
index XXXXXXX..XXXXXXX 100644
432
--- a/target/mips/tcg/sysemu/special_helper.c
433
+++ b/target/mips/tcg/sysemu/special_helper.c
434
@@ -XXX,XX +XXX,XX @@ bool mips_io_recompile_replay_branch(CPUState *cs, const TranslationBlock *tb)
435
CPUMIPSState *env = &cpu->env;
436
437
if ((env->hflags & MIPS_HFLAG_BMASK) != 0
438
- && env->active_tc.PC != tb->pc) {
439
+ && env->active_tc.PC != tb_pc(tb)) {
440
env->active_tc.PC -= (env->hflags & MIPS_HFLAG_B16 ? 2 : 4);
441
env->hflags &= ~MIPS_HFLAG_BMASK;
442
return true;
443
diff --git a/target/openrisc/cpu.c b/target/openrisc/cpu.c
444
index XXXXXXX..XXXXXXX 100644
445
--- a/target/openrisc/cpu.c
446
+++ b/target/openrisc/cpu.c
447
@@ -XXX,XX +XXX,XX @@ static void openrisc_cpu_synchronize_from_tb(CPUState *cs,
448
{
449
OpenRISCCPU *cpu = OPENRISC_CPU(cs);
450
451
- cpu->env.pc = tb->pc;
452
+ cpu->env.pc = tb_pc(tb);
453
}
454
455
456
diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
457
index XXXXXXX..XXXXXXX 100644
458
--- a/target/riscv/cpu.c
459
+++ b/target/riscv/cpu.c
460
@@ -XXX,XX +XXX,XX @@ static void riscv_cpu_synchronize_from_tb(CPUState *cs,
461
RISCVMXL xl = FIELD_EX32(tb->flags, TB_FLAGS, XL);
462
463
if (xl == MXL_RV32) {
464
- env->pc = (int32_t)tb->pc;
465
+ env->pc = (int32_t)tb_pc(tb);
466
} else {
467
- env->pc = tb->pc;
468
+ env->pc = tb_pc(tb);
469
}
470
}
471
472
diff --git a/target/rx/cpu.c b/target/rx/cpu.c
473
index XXXXXXX..XXXXXXX 100644
474
--- a/target/rx/cpu.c
475
+++ b/target/rx/cpu.c
476
@@ -XXX,XX +XXX,XX @@ static void rx_cpu_synchronize_from_tb(CPUState *cs,
477
{
478
RXCPU *cpu = RX_CPU(cs);
479
480
- cpu->env.pc = tb->pc;
481
+ cpu->env.pc = tb_pc(tb);
482
}
483
484
static bool rx_cpu_has_work(CPUState *cs)
485
diff --git a/target/sh4/cpu.c b/target/sh4/cpu.c
486
index XXXXXXX..XXXXXXX 100644
487
--- a/target/sh4/cpu.c
488
+++ b/target/sh4/cpu.c
489
@@ -XXX,XX +XXX,XX @@ static void superh_cpu_synchronize_from_tb(CPUState *cs,
490
{
491
SuperHCPU *cpu = SUPERH_CPU(cs);
492
493
- cpu->env.pc = tb->pc;
494
+ cpu->env.pc = tb_pc(tb);
495
cpu->env.flags = tb->flags & TB_FLAG_ENVFLAGS_MASK;
496
}
497
498
@@ -XXX,XX +XXX,XX @@ static bool superh_io_recompile_replay_branch(CPUState *cs,
499
CPUSH4State *env = &cpu->env;
500
501
if ((env->flags & ((DELAY_SLOT | DELAY_SLOT_CONDITIONAL))) != 0
502
- && env->pc != tb->pc) {
503
+ && env->pc != tb_pc(tb)) {
504
env->pc -= 2;
505
env->flags &= ~(DELAY_SLOT | DELAY_SLOT_CONDITIONAL);
506
return true;
507
diff --git a/target/sparc/cpu.c b/target/sparc/cpu.c
508
index XXXXXXX..XXXXXXX 100644
509
--- a/target/sparc/cpu.c
510
+++ b/target/sparc/cpu.c
511
@@ -XXX,XX +XXX,XX @@ static void sparc_cpu_synchronize_from_tb(CPUState *cs,
512
{
513
SPARCCPU *cpu = SPARC_CPU(cs);
514
515
- cpu->env.pc = tb->pc;
516
+ cpu->env.pc = tb_pc(tb);
517
cpu->env.npc = tb->cs_base;
518
}
519
520
diff --git a/target/tricore/cpu.c b/target/tricore/cpu.c
521
index XXXXXXX..XXXXXXX 100644
522
--- a/target/tricore/cpu.c
523
+++ b/target/tricore/cpu.c
524
@@ -XXX,XX +XXX,XX @@ static void tricore_cpu_synchronize_from_tb(CPUState *cs,
525
TriCoreCPU *cpu = TRICORE_CPU(cs);
526
CPUTriCoreState *env = &cpu->env;
527
528
- env->PC = tb->pc;
529
+ env->PC = tb_pc(tb);
530
}
531
532
static void tricore_cpu_reset(DeviceState *dev)
533
diff --git a/tcg/tcg.c b/tcg/tcg.c
55
diff --git a/tcg/tcg.c b/tcg/tcg.c
534
index XXXXXXX..XXXXXXX 100644
56
index XXXXXXX..XXXXXXX 100644
535
--- a/tcg/tcg.c
57
--- a/tcg/tcg.c
536
+++ b/tcg/tcg.c
58
+++ b/tcg/tcg.c
537
@@ -XXX,XX +XXX,XX @@ int64_t tcg_cpu_exec_time(void)
59
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, target_ulong pc_start)
538
#endif
60
#endif
539
61
540
62
/* Initialize goto_tb jump offsets. */
541
-int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
63
- tb->jmp_reset_offset[0] = TB_JMP_RESET_OFFSET_INVALID;
542
+int tcg_gen_code(TCGContext *s, TranslationBlock *tb, target_ulong pc_start)
64
- tb->jmp_reset_offset[1] = TB_JMP_RESET_OFFSET_INVALID;
543
{
65
+ tb->jmp_reset_offset[0] = TB_JMP_OFFSET_INVALID;
544
#ifdef CONFIG_PROFILER
66
+ tb->jmp_reset_offset[1] = TB_JMP_OFFSET_INVALID;
545
TCGProfile *prof = &s->prof;
67
tcg_ctx->tb_jmp_reset_offset = tb->jmp_reset_offset;
546
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
68
if (TCG_TARGET_HAS_direct_jump) {
547
69
tcg_ctx->tb_jmp_insn_offset = tb->jmp_target_arg;
548
#ifdef DEBUG_DISAS
549
if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP)
550
- && qemu_log_in_addr_range(tb->pc))) {
551
+ && qemu_log_in_addr_range(pc_start))) {
552
FILE *logfile = qemu_log_trylock();
553
if (logfile) {
554
fprintf(logfile, "OP:\n");
555
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
556
if (s->nb_indirects > 0) {
557
#ifdef DEBUG_DISAS
558
if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP_IND)
559
- && qemu_log_in_addr_range(tb->pc))) {
560
+ && qemu_log_in_addr_range(pc_start))) {
561
FILE *logfile = qemu_log_trylock();
562
if (logfile) {
563
fprintf(logfile, "OP before indirect lowering:\n");
564
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
565
566
#ifdef DEBUG_DISAS
567
if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP_OPT)
568
- && qemu_log_in_addr_range(tb->pc))) {
569
+ && qemu_log_in_addr_range(pc_start))) {
570
FILE *logfile = qemu_log_trylock();
571
if (logfile) {
572
fprintf(logfile, "OP after optimization and liveness analysis:\n");
573
--
70
--
574
2.34.1
71
2.34.1
575
72
576
73
1
Bool is a more appropriate type for the alloc parameter.
1
This can replace four other variables that are references
2
into the TranslationBlock structure.
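
For illustration, a minimal compilable sketch of the resulting pattern; the
types and the CF_PARALLEL value below are simplified stand-ins for the real
TranslationBlock and TCGContext, used only to show the back-pointer idea:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef struct {
        uint32_t cflags;
        uint16_t jmp_reset_offset[2];
    } DemoTB;                        /* stand-in for TranslationBlock */

    typedef struct {
        DemoTB *gen_tb;              /* TB for which code is being generated */
    } DemoCtx;                       /* stand-in for TCGContext */

    #define CF_PARALLEL 0x0008u      /* demo value only */

    /* was: s->tb_cflags & CF_PARALLEL, with tb_cflags copied out at setup */
    static bool demo_parallel(const DemoCtx *s)
    {
        return (s->gen_tb->cflags & CF_PARALLEL) != 0;
    }

    int main(void)
    {
        DemoTB tb = { .cflags = CF_PARALLEL };
        DemoCtx ctx = { .gen_tb = &tb };
        printf("parallel: %d\n", demo_parallel(&ctx));
        return 0;
    }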
2
3
3
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
4
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
6
---
7
accel/tcg/translate-all.c | 14 +++++++-------
7
include/tcg/tcg.h | 11 +++--------
8
1 file changed, 7 insertions(+), 7 deletions(-)
8
accel/tcg/translate-all.c | 2 +-
9
tcg/tcg-op.c | 14 +++++++-------
10
tcg/tcg.c | 14 +++-----------
11
4 files changed, 14 insertions(+), 27 deletions(-)
9
12
13
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
14
index XXXXXXX..XXXXXXX 100644
15
--- a/include/tcg/tcg.h
16
+++ b/include/tcg/tcg.h
17
@@ -XXX,XX +XXX,XX @@ struct TCGContext {
18
int nb_indirects;
19
int nb_ops;
20
21
- /* goto_tb support */
22
- tcg_insn_unit *code_buf;
23
- uint16_t *tb_jmp_reset_offset; /* tb->jmp_reset_offset */
24
- uintptr_t *tb_jmp_insn_offset; /* tb->jmp_target_arg if direct_jump */
25
- uintptr_t *tb_jmp_target_addr; /* tb->jmp_target_arg if !direct_jump */
26
-
27
TCGRegSet reserved_regs;
28
- uint32_t tb_cflags; /* cflags of the current TB */
29
intptr_t current_frame_offset;
30
intptr_t frame_start;
31
intptr_t frame_end;
32
TCGTemp *frame_temp;
33
34
- tcg_insn_unit *code_ptr;
35
+ TranslationBlock *gen_tb; /* tb for which code is being generated */
36
+ tcg_insn_unit *code_buf; /* pointer for start of tb */
37
+ tcg_insn_unit *code_ptr; /* pointer for running end of tb */
38
39
#ifdef CONFIG_PROFILER
40
TCGProfile prof;
10
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
41
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
11
index XXXXXXX..XXXXXXX 100644
42
index XXXXXXX..XXXXXXX 100644
12
--- a/accel/tcg/translate-all.c
43
--- a/accel/tcg/translate-all.c
13
+++ b/accel/tcg/translate-all.c
44
+++ b/accel/tcg/translate-all.c
14
@@ -XXX,XX +XXX,XX @@ void page_init(void)
45
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
15
#endif
46
tb->trace_vcpu_dstate = *cpu->trace_dstate;
47
tb_set_page_addr0(tb, phys_pc);
48
tb_set_page_addr1(tb, -1);
49
- tcg_ctx->tb_cflags = cflags;
50
+ tcg_ctx->gen_tb = tb;
51
tb_overflow:
52
53
#ifdef CONFIG_PROFILER
54
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
55
index XXXXXXX..XXXXXXX 100644
56
--- a/tcg/tcg-op.c
57
+++ b/tcg/tcg-op.c
58
@@ -XXX,XX +XXX,XX @@ void tcg_gen_op6(TCGOpcode opc, TCGArg a1, TCGArg a2, TCGArg a3,
59
60
void tcg_gen_mb(TCGBar mb_type)
61
{
62
- if (tcg_ctx->tb_cflags & CF_PARALLEL) {
63
+ if (tcg_ctx->gen_tb->cflags & CF_PARALLEL) {
64
tcg_gen_op1(INDEX_op_mb, mb_type);
65
}
16
}
66
}
17
67
@@ -XXX,XX +XXX,XX @@ void tcg_gen_exit_tb(const TranslationBlock *tb, unsigned idx)
18
-static PageDesc *page_find_alloc(tb_page_addr_t index, int alloc)
68
void tcg_gen_goto_tb(unsigned idx)
19
+static PageDesc *page_find_alloc(tb_page_addr_t index, bool alloc)
20
{
69
{
21
PageDesc *pd;
70
/* We tested CF_NO_GOTO_TB in translator_use_goto_tb. */
22
void **lp;
71
- tcg_debug_assert(!(tcg_ctx->tb_cflags & CF_NO_GOTO_TB));
23
@@ -XXX,XX +XXX,XX @@ static PageDesc *page_find_alloc(tb_page_addr_t index, int alloc)
72
+ tcg_debug_assert(!(tcg_ctx->gen_tb->cflags & CF_NO_GOTO_TB));
24
73
/* We only support two chained exits. */
25
static inline PageDesc *page_find(tb_page_addr_t index)
74
tcg_debug_assert(idx <= TB_EXIT_IDXMAX);
75
#ifdef CONFIG_DEBUG_TCG
76
@@ -XXX,XX +XXX,XX @@ void tcg_gen_lookup_and_goto_ptr(void)
26
{
77
{
27
- return page_find_alloc(index, 0);
78
TCGv_ptr ptr;
28
+ return page_find_alloc(index, false);
79
80
- if (tcg_ctx->tb_cflags & CF_NO_GOTO_PTR) {
81
+ if (tcg_ctx->gen_tb->cflags & CF_NO_GOTO_PTR) {
82
tcg_gen_exit_tb(NULL, 0);
83
return;
84
}
85
@@ -XXX,XX +XXX,XX @@ void tcg_gen_atomic_cmpxchg_i32(TCGv_i32 retv, TCGv addr, TCGv_i32 cmpv,
86
{
87
memop = tcg_canonicalize_memop(memop, 0, 0);
88
89
- if (!(tcg_ctx->tb_cflags & CF_PARALLEL)) {
90
+ if (!(tcg_ctx->gen_tb->cflags & CF_PARALLEL)) {
91
TCGv_i32 t1 = tcg_temp_new_i32();
92
TCGv_i32 t2 = tcg_temp_new_i32();
93
94
@@ -XXX,XX +XXX,XX @@ void tcg_gen_atomic_cmpxchg_i64(TCGv_i64 retv, TCGv addr, TCGv_i64 cmpv,
95
{
96
memop = tcg_canonicalize_memop(memop, 1, 0);
97
98
- if (!(tcg_ctx->tb_cflags & CF_PARALLEL)) {
99
+ if (!(tcg_ctx->gen_tb->cflags & CF_PARALLEL)) {
100
TCGv_i64 t1 = tcg_temp_new_i64();
101
TCGv_i64 t2 = tcg_temp_new_i64();
102
103
@@ -XXX,XX +XXX,XX @@ static void * const table_##NAME[(MO_SIZE | MO_BSWAP) + 1] = { \
104
void tcg_gen_atomic_##NAME##_i32 \
105
(TCGv_i32 ret, TCGv addr, TCGv_i32 val, TCGArg idx, MemOp memop) \
106
{ \
107
- if (tcg_ctx->tb_cflags & CF_PARALLEL) { \
108
+ if (tcg_ctx->gen_tb->cflags & CF_PARALLEL) { \
109
do_atomic_op_i32(ret, addr, val, idx, memop, table_##NAME); \
110
} else { \
111
do_nonatomic_op_i32(ret, addr, val, idx, memop, NEW, \
112
@@ -XXX,XX +XXX,XX @@ void tcg_gen_atomic_##NAME##_i32 \
113
void tcg_gen_atomic_##NAME##_i64 \
114
(TCGv_i64 ret, TCGv addr, TCGv_i64 val, TCGArg idx, MemOp memop) \
115
{ \
116
- if (tcg_ctx->tb_cflags & CF_PARALLEL) { \
117
+ if (tcg_ctx->gen_tb->cflags & CF_PARALLEL) { \
118
do_atomic_op_i64(ret, addr, val, idx, memop, table_##NAME); \
119
} else { \
120
do_nonatomic_op_i64(ret, addr, val, idx, memop, NEW, \
121
diff --git a/tcg/tcg.c b/tcg/tcg.c
122
index XXXXXXX..XXXXXXX 100644
123
--- a/tcg/tcg.c
124
+++ b/tcg/tcg.c
125
@@ -XXX,XX +XXX,XX @@ static void set_jmp_reset_offset(TCGContext *s, int which)
126
* We will check for overflow at the end of the opcode loop in
127
* tcg_gen_code, where we bound tcg_current_code_size to UINT16_MAX.
128
*/
129
- s->tb_jmp_reset_offset[which] = tcg_current_code_size(s);
130
+ s->gen_tb->jmp_reset_offset[which] = tcg_current_code_size(s);
29
}
131
}
30
132
31
static void page_lock_pair(PageDesc **ret_p1, tb_page_addr_t phys1,
133
static void G_GNUC_UNUSED set_jmp_insn_offset(TCGContext *s, int which)
32
- PageDesc **ret_p2, tb_page_addr_t phys2, int alloc);
134
@@ -XXX,XX +XXX,XX @@ static void G_GNUC_UNUSED set_jmp_insn_offset(TCGContext *s, int which)
33
+ PageDesc **ret_p2, tb_page_addr_t phys2, bool alloc);
135
* tcg_gen_code, where we bound tcg_current_code_size to UINT16_MAX.
34
136
*/
35
/* In user-mode page locks aren't used; mmap_lock is enough */
137
tcg_debug_assert(TCG_TARGET_HAS_direct_jump);
36
#ifdef CONFIG_USER_ONLY
138
- s->tb_jmp_insn_offset[which] = tcg_current_code_size(s);
37
@@ -XXX,XX +XXX,XX @@ static inline void page_unlock(PageDesc *pd)
139
+ s->gen_tb->jmp_target_arg[which] = tcg_current_code_size(s);
38
/* lock the page(s) of a TB in the correct acquisition order */
39
static inline void page_lock_tb(const TranslationBlock *tb)
40
{
41
- page_lock_pair(NULL, tb->page_addr[0], NULL, tb->page_addr[1], 0);
42
+ page_lock_pair(NULL, tb->page_addr[0], NULL, tb->page_addr[1], false);
43
}
140
}
44
141
45
static inline void page_unlock_tb(const TranslationBlock *tb)
142
static uintptr_t G_GNUC_UNUSED get_jmp_target_addr(TCGContext *s, int which)
46
@@ -XXX,XX +XXX,XX @@ void page_collection_unlock(struct page_collection *set)
143
@@ -XXX,XX +XXX,XX @@ static uintptr_t G_GNUC_UNUSED get_jmp_target_addr(TCGContext *s, int which)
47
#endif /* !CONFIG_USER_ONLY */
144
* Return the read-execute version of the pointer, for the benefit
48
145
* of any pc-relative addressing mode.
49
static void page_lock_pair(PageDesc **ret_p1, tb_page_addr_t phys1,
50
- PageDesc **ret_p2, tb_page_addr_t phys2, int alloc)
51
+ PageDesc **ret_p2, tb_page_addr_t phys2, bool alloc)
52
{
53
PageDesc *p1, *p2;
54
tb_page_addr_t page1;
55
@@ -XXX,XX +XXX,XX @@ tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
56
* Note that inserting into the hash table first isn't an option, since
57
* we can only insert TBs that are fully initialized.
58
*/
146
*/
59
- page_lock_pair(&p, phys_pc, &p2, phys_page2, 1);
147
- return (uintptr_t)tcg_splitwx_to_rx(&s->tb_jmp_target_addr[which]);
60
+ page_lock_pair(&p, phys_pc, &p2, phys_page2, true);
148
+ return (uintptr_t)tcg_splitwx_to_rx(s->gen_tb->jmp_target_arg + which);
61
tb_page_add(p, tb, 0, phys_pc & TARGET_PAGE_MASK);
149
}
62
if (p2) {
150
63
tb_page_add(p2, tb, 1, phys_page2);
151
/* Signal overflow, starting over with fewer guest insns. */
64
@@ -XXX,XX +XXX,XX @@ void page_set_flags(target_ulong start, target_ulong end, int flags)
152
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, target_ulong pc_start)
65
for (addr = start, len = end - start;
153
/* Initialize goto_tb jump offsets. */
66
len != 0;
154
tb->jmp_reset_offset[0] = TB_JMP_OFFSET_INVALID;
67
len -= TARGET_PAGE_SIZE, addr += TARGET_PAGE_SIZE) {
155
tb->jmp_reset_offset[1] = TB_JMP_OFFSET_INVALID;
68
- PageDesc *p = page_find_alloc(addr >> TARGET_PAGE_BITS, 1);
156
- tcg_ctx->tb_jmp_reset_offset = tb->jmp_reset_offset;
69
+ PageDesc *p = page_find_alloc(addr >> TARGET_PAGE_BITS, true);
157
- if (TCG_TARGET_HAS_direct_jump) {
70
158
- tcg_ctx->tb_jmp_insn_offset = tb->jmp_target_arg;
71
/* If the write protection bit is set, then we invalidate
159
- tcg_ctx->tb_jmp_target_addr = NULL;
72
the code inside. */
160
- } else {
161
- tcg_ctx->tb_jmp_insn_offset = NULL;
162
- tcg_ctx->tb_jmp_target_addr = tb->jmp_target_arg;
163
- }
164
165
tcg_reg_alloc_start(s);
166
73
--
167
--
74
2.34.1
168
2.34.1
75
169
76
170
1
Prepare for targets to be able to produce TBs that can
1
Stop overloading jmp_target_arg for both offset and address,
2
run in more than one virtual context.
2
depending on TCG_TARGET_HAS_direct_jump. Instead, add a new
3
field to hold the jump insn offset and always set the target
4
address in jmp_target_addr[]. This will allow a tcg backend
5
to use either a direct or an indirect branch, depending on the displacement.
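
For illustration only, a sketch with a stand-in type and an assumed 32-bit
displacement test (the real range check is backend-specific): the point is
that both pieces of information are now recorded per jump, so the patching
code can use a direct branch when the target is in range and fall back to
the indirect slot otherwise.

    #include <stdint.h>
    #include <stdio.h>

    typedef struct {
        void     *tc_ptr;              /* start of generated code */
        uint16_t  jmp_insn_offset[2];  /* offset of the branch insn, if any */
        uintptr_t jmp_target_addr[2];  /* slot read by the indirect path */
    } DemoTB;                          /* stand-in for TranslationBlock */

    static void demo_set_jmp_target(DemoTB *tb, int n, uintptr_t addr)
    {
        tb->jmp_target_addr[n] = addr;  /* the indirect path always works */

        uintptr_t insn = (uintptr_t)tb->tc_ptr + tb->jmp_insn_offset[n];
        intptr_t disp = (intptr_t)(addr - insn);
        if (disp == (intptr_t)(int32_t)disp) {
            printf("patch direct branch at %#lx\n", (unsigned long)insn);
        } else {
            printf("leave indirect jump via jmp_target_addr[%d]\n", n);
        }
    }

    int main(void)
    {
        static char code[64];
        DemoTB tb = { .tc_ptr = code, .jmp_insn_offset = { 8, 16 } };
        demo_set_jmp_target(&tb, 0, (uintptr_t)code + 32);
        return 0;
    }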
3
6
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
7
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
8
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
9
---
7
accel/tcg/internal.h | 4 +++
10
include/exec/exec-all.h | 3 ++-
8
accel/tcg/tb-jmp-cache.h | 41 +++++++++++++++++++++++++
11
accel/tcg/cpu-exec.c | 5 ++---
9
include/exec/cpu-defs.h | 3 ++
12
tcg/tcg.c | 6 ++++--
10
include/exec/exec-all.h | 32 ++++++++++++++++++--
13
3 files changed, 8 insertions(+), 6 deletions(-)
11
accel/tcg/cpu-exec.c | 16 ++++++----
12
accel/tcg/translate-all.c | 64 ++++++++++++++++++++++++++-------------
13
6 files changed, 131 insertions(+), 29 deletions(-)
14
14
15
diff --git a/accel/tcg/internal.h b/accel/tcg/internal.h
16
index XXXXXXX..XXXXXXX 100644
17
--- a/accel/tcg/internal.h
18
+++ b/accel/tcg/internal.h
19
@@ -XXX,XX +XXX,XX @@ void tb_htable_init(void);
20
/* Return the current PC from CPU, which may be cached in TB. */
21
static inline target_ulong log_pc(CPUState *cpu, const TranslationBlock *tb)
22
{
23
+#if TARGET_TB_PCREL
24
+ return cpu->cc->get_pc(cpu);
25
+#else
26
return tb_pc(tb);
27
+#endif
28
}
29
30
#endif /* ACCEL_TCG_INTERNAL_H */
31
diff --git a/accel/tcg/tb-jmp-cache.h b/accel/tcg/tb-jmp-cache.h
32
index XXXXXXX..XXXXXXX 100644
33
--- a/accel/tcg/tb-jmp-cache.h
34
+++ b/accel/tcg/tb-jmp-cache.h
35
@@ -XXX,XX +XXX,XX @@
36
37
/*
38
* Accessed in parallel; all accesses to 'tb' must be atomic.
39
+ * For TARGET_TB_PCREL, accesses to 'pc' must be protected by
40
+ * a load_acquire/store_release to 'tb'.
41
*/
42
struct CPUJumpCache {
43
struct {
44
TranslationBlock *tb;
45
+#if TARGET_TB_PCREL
46
+ target_ulong pc;
47
+#endif
48
} array[TB_JMP_CACHE_SIZE];
49
};
50
51
+static inline TranslationBlock *
52
+tb_jmp_cache_get_tb(CPUJumpCache *jc, uint32_t hash)
53
+{
54
+#if TARGET_TB_PCREL
55
+ /* Use acquire to ensure current load of pc from jc. */
56
+ return qatomic_load_acquire(&jc->array[hash].tb);
57
+#else
58
+ /* Use rcu_read to ensure current load of pc from *tb. */
59
+ return qatomic_rcu_read(&jc->array[hash].tb);
60
+#endif
61
+}
62
+
63
+static inline target_ulong
64
+tb_jmp_cache_get_pc(CPUJumpCache *jc, uint32_t hash, TranslationBlock *tb)
65
+{
66
+#if TARGET_TB_PCREL
67
+ return jc->array[hash].pc;
68
+#else
69
+ return tb_pc(tb);
70
+#endif
71
+}
72
+
73
+static inline void
74
+tb_jmp_cache_set(CPUJumpCache *jc, uint32_t hash,
75
+ TranslationBlock *tb, target_ulong pc)
76
+{
77
+#if TARGET_TB_PCREL
78
+ jc->array[hash].pc = pc;
79
+ /* Use store_release on tb to ensure pc is written first. */
80
+ qatomic_store_release(&jc->array[hash].tb, tb);
81
+#else
82
+ /* Use the pc value already stored in tb->pc. */
83
+ qatomic_set(&jc->array[hash].tb, tb);
84
+#endif
85
+}
86
+
87
#endif /* ACCEL_TCG_TB_JMP_CACHE_H */
88
diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
89
index XXXXXXX..XXXXXXX 100644
90
--- a/include/exec/cpu-defs.h
91
+++ b/include/exec/cpu-defs.h
92
@@ -XXX,XX +XXX,XX @@
93
# error TARGET_PAGE_BITS must be defined in cpu-param.h
94
# endif
95
#endif
96
+#ifndef TARGET_TB_PCREL
97
+# define TARGET_TB_PCREL 0
98
+#endif
99
100
#define TARGET_LONG_SIZE (TARGET_LONG_BITS / 8)
101
102
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
15
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
103
index XXXXXXX..XXXXXXX 100644
16
index XXXXXXX..XXXXXXX 100644
104
--- a/include/exec/exec-all.h
17
--- a/include/exec/exec-all.h
105
+++ b/include/exec/exec-all.h
18
+++ b/include/exec/exec-all.h
106
@@ -XXX,XX +XXX,XX @@ struct tb_tc {
107
};
108
109
struct TranslationBlock {
110
- target_ulong pc; /* simulated PC corresponding to this block (EIP + CS base) */
111
- target_ulong cs_base; /* CS base for this block */
112
+#if !TARGET_TB_PCREL
113
+ /*
114
+ * Guest PC corresponding to this block. This must be the true
115
+ * virtual address. Therefore e.g. x86 stores EIP + CS_BASE, and
116
+ * targets like Arm, MIPS, HP-PA, which reuse low bits for ISA or
117
+ * privilege, must store those bits elsewhere.
118
+ *
119
+ * If TARGET_TB_PCREL, the opcodes for the TranslationBlock are
120
+ * written such that the TB is associated only with the physical
121
+ * page and may be run in any virtual address context. In this case,
122
+ * PC must always be taken from ENV in a target-specific manner.
123
+ * Unwind information is taken as offsets from the page, to be
124
+ * deposited into the "current" PC.
125
+ */
126
+ target_ulong pc;
127
+#endif
128
+
129
+ /*
130
+ * Target-specific data associated with the TranslationBlock, e.g.:
131
+ * x86: the original user, the Code Segment virtual base,
132
+ * arm: an extension of tb->flags,
133
+ * s390x: instruction data for EXECUTE,
134
+ * sparc: the next pc of the instruction queue (for delay slots).
135
+ */
136
+ target_ulong cs_base;
137
+
138
uint32_t flags; /* flags defining in which context the code was generated */
139
uint32_t cflags; /* compile flags */
140
141
@@ -XXX,XX +XXX,XX @@ struct TranslationBlock {
19
@@ -XXX,XX +XXX,XX @@ struct TranslationBlock {
142
/* Hide the read to avoid ifdefs for TARGET_TB_PCREL. */
20
*/
143
static inline target_ulong tb_pc(const TranslationBlock *tb)
21
#define TB_JMP_OFFSET_INVALID 0xffff /* indicates no jump generated */
144
{
22
uint16_t jmp_reset_offset[2]; /* offset of original jump target */
145
+#if TARGET_TB_PCREL
23
- uintptr_t jmp_target_arg[2]; /* target address or offset */
146
+ qemu_build_not_reached();
24
+ uint16_t jmp_insn_offset[2]; /* offset of direct jump insn */
147
+#else
25
+ uintptr_t jmp_target_addr[2]; /* target address */
148
return tb->pc;
26
149
+#endif
27
/*
150
}
28
* Each TB has a NULL-terminated list (jmp_list_head) of incoming jumps.
151
152
/* Hide the qatomic_read to make code a little easier on the eyes */
153
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
29
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
154
index XXXXXXX..XXXXXXX 100644
30
index XXXXXXX..XXXXXXX 100644
155
--- a/accel/tcg/cpu-exec.c
31
--- a/accel/tcg/cpu-exec.c
156
+++ b/accel/tcg/cpu-exec.c
32
+++ b/accel/tcg/cpu-exec.c
157
@@ -XXX,XX +XXX,XX @@ static bool tb_lookup_cmp(const void *p, const void *d)
33
@@ -XXX,XX +XXX,XX @@ void cpu_exec_step_atomic(CPUState *cpu)
158
const TranslationBlock *tb = p;
34
159
const struct tb_desc *desc = d;
35
void tb_set_jmp_target(TranslationBlock *tb, int n, uintptr_t addr)
160
36
{
161
- if (tb_pc(tb) == desc->pc &&
37
+ tb->jmp_target_addr[n] = addr;
162
+ if ((TARGET_TB_PCREL || tb_pc(tb) == desc->pc) &&
38
if (TCG_TARGET_HAS_direct_jump) {
163
tb->page_addr[0] == desc->page_addr0 &&
39
- uintptr_t offset = tb->jmp_target_arg[n];
164
tb->cs_base == desc->cs_base &&
40
+ uintptr_t offset = tb->jmp_insn_offset[n];
165
tb->flags == desc->flags &&
41
uintptr_t tc_ptr = (uintptr_t)tb->tc.ptr;
166
@@ -XXX,XX +XXX,XX @@ static TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc,
42
uintptr_t jmp_rx = tc_ptr + offset;
167
return NULL;
43
uintptr_t jmp_rw = jmp_rx - tcg_splitwx_diff;
44
tb_target_set_jmp_target(tc_ptr, jmp_rx, jmp_rw, addr);
45
- } else {
46
- tb->jmp_target_arg[n] = addr;
168
}
47
}
169
desc.page_addr0 = phys_pc;
170
- h = tb_hash_func(phys_pc, pc, flags, cflags, *cpu->trace_dstate);
171
+ h = tb_hash_func(phys_pc, (TARGET_TB_PCREL ? 0 : pc),
172
+ flags, cflags, *cpu->trace_dstate);
173
return qht_lookup_custom(&tb_ctx.htable, &desc, h, tb_lookup_cmp);
174
}
48
}
175
49
176
@@ -XXX,XX +XXX,XX @@ static inline TranslationBlock *tb_lookup(CPUState *cpu, target_ulong pc,
50
diff --git a/tcg/tcg.c b/tcg/tcg.c
177
uint32_t flags, uint32_t cflags)
51
index XXXXXXX..XXXXXXX 100644
178
{
52
--- a/tcg/tcg.c
179
TranslationBlock *tb;
53
+++ b/tcg/tcg.c
180
+ CPUJumpCache *jc;
54
@@ -XXX,XX +XXX,XX @@ static void G_GNUC_UNUSED set_jmp_insn_offset(TCGContext *s, int which)
181
uint32_t hash;
55
* tcg_gen_code, where we bound tcg_current_code_size to UINT16_MAX.
182
56
*/
183
/* we should never be trying to look up an INVALID tb */
57
tcg_debug_assert(TCG_TARGET_HAS_direct_jump);
184
tcg_debug_assert(!(cflags & CF_INVALID));
58
- s->gen_tb->jmp_target_arg[which] = tcg_current_code_size(s);
185
59
+ s->gen_tb->jmp_insn_offset[which] = tcg_current_code_size(s);
186
hash = tb_jmp_cache_hash_func(pc);
187
- tb = qatomic_rcu_read(&cpu->tb_jmp_cache->array[hash].tb);
188
+ jc = cpu->tb_jmp_cache;
189
+ tb = tb_jmp_cache_get_tb(jc, hash);
190
191
if (likely(tb &&
192
- tb->pc == pc &&
193
+ tb_jmp_cache_get_pc(jc, hash, tb) == pc &&
194
tb->cs_base == cs_base &&
195
tb->flags == flags &&
196
tb->trace_vcpu_dstate == *cpu->trace_dstate &&
197
@@ -XXX,XX +XXX,XX @@ static inline TranslationBlock *tb_lookup(CPUState *cpu, target_ulong pc,
198
if (tb == NULL) {
199
return NULL;
200
}
201
- qatomic_set(&cpu->tb_jmp_cache->array[hash].tb, tb);
202
+ tb_jmp_cache_set(jc, hash, tb, pc);
203
return tb;
204
}
60
}
205
61
206
@@ -XXX,XX +XXX,XX @@ cpu_tb_exec(CPUState *cpu, TranslationBlock *itb, int *tb_exit)
62
static uintptr_t G_GNUC_UNUSED get_jmp_target_addr(TCGContext *s, int which)
207
if (cc->tcg_ops->synchronize_from_tb) {
63
@@ -XXX,XX +XXX,XX @@ static uintptr_t G_GNUC_UNUSED get_jmp_target_addr(TCGContext *s, int which)
208
cc->tcg_ops->synchronize_from_tb(cpu, last_tb);
64
* Return the read-execute version of the pointer, for the benefit
209
} else {
65
* of any pc-relative addressing mode.
210
+ assert(!TARGET_TB_PCREL);
66
*/
211
assert(cc->set_pc);
67
- return (uintptr_t)tcg_splitwx_to_rx(s->gen_tb->jmp_target_arg + which);
212
cc->set_pc(cpu, tb_pc(last_tb));
68
+ return (uintptr_t)tcg_splitwx_to_rx(&s->gen_tb->jmp_target_addr[which]);
213
}
214
@@ -XXX,XX +XXX,XX @@ int cpu_exec(CPUState *cpu)
215
* for the fast lookup
216
*/
217
h = tb_jmp_cache_hash_func(pc);
218
- qatomic_set(&cpu->tb_jmp_cache->array[h].tb, tb);
219
+ tb_jmp_cache_set(cpu->tb_jmp_cache, h, tb, pc);
220
}
221
222
#ifndef CONFIG_USER_ONLY
223
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
224
index XXXXXXX..XXXXXXX 100644
225
--- a/accel/tcg/translate-all.c
226
+++ b/accel/tcg/translate-all.c
227
@@ -XXX,XX +XXX,XX @@ static int encode_search(TranslationBlock *tb, uint8_t *block)
228
229
for (j = 0; j < TARGET_INSN_START_WORDS; ++j) {
230
if (i == 0) {
231
- prev = (j == 0 ? tb_pc(tb) : 0);
232
+ prev = (!TARGET_TB_PCREL && j == 0 ? tb_pc(tb) : 0);
233
} else {
234
prev = tcg_ctx->gen_insn_data[i - 1][j];
235
}
236
@@ -XXX,XX +XXX,XX @@ static int encode_search(TranslationBlock *tb, uint8_t *block)
237
static int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
238
uintptr_t searched_pc, bool reset_icount)
239
{
240
- target_ulong data[TARGET_INSN_START_WORDS] = { tb_pc(tb) };
241
+ target_ulong data[TARGET_INSN_START_WORDS];
242
uintptr_t host_pc = (uintptr_t)tb->tc.ptr;
243
CPUArchState *env = cpu->env_ptr;
244
const uint8_t *p = tb->tc.ptr + tb->tc.size;
245
@@ -XXX,XX +XXX,XX @@ static int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
246
return -1;
247
}
248
249
+ memset(data, 0, sizeof(data));
250
+ if (!TARGET_TB_PCREL) {
251
+ data[0] = tb_pc(tb);
252
+ }
253
+
254
/* Reconstruct the stored insn data while looking for the point at
255
which the end of the insn exceeds the searched_pc. */
256
for (i = 0; i < num_insns; ++i) {
257
@@ -XXX,XX +XXX,XX @@ static bool tb_cmp(const void *ap, const void *bp)
258
const TranslationBlock *a = ap;
259
const TranslationBlock *b = bp;
260
261
- return tb_pc(a) == tb_pc(b) &&
262
- a->cs_base == b->cs_base &&
263
- a->flags == b->flags &&
264
- (tb_cflags(a) & ~CF_INVALID) == (tb_cflags(b) & ~CF_INVALID) &&
265
- a->trace_vcpu_dstate == b->trace_vcpu_dstate &&
266
- a->page_addr[0] == b->page_addr[0] &&
267
- a->page_addr[1] == b->page_addr[1];
268
+ return ((TARGET_TB_PCREL || tb_pc(a) == tb_pc(b)) &&
269
+ a->cs_base == b->cs_base &&
270
+ a->flags == b->flags &&
271
+ (tb_cflags(a) & ~CF_INVALID) == (tb_cflags(b) & ~CF_INVALID) &&
272
+ a->trace_vcpu_dstate == b->trace_vcpu_dstate &&
273
+ a->page_addr[0] == b->page_addr[0] &&
274
+ a->page_addr[1] == b->page_addr[1]);
275
}
69
}
276
70
277
void tb_htable_init(void)
71
/* Signal overflow, starting over with fewer guest insns. */
278
@@ -XXX,XX +XXX,XX @@ static inline void tb_jmp_unlink(TranslationBlock *dest)
72
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, target_ulong pc_start)
279
qemu_spin_unlock(&dest->jmp_lock);
73
/* Initialize goto_tb jump offsets. */
280
}
74
tb->jmp_reset_offset[0] = TB_JMP_OFFSET_INVALID;
281
75
tb->jmp_reset_offset[1] = TB_JMP_OFFSET_INVALID;
282
+static void tb_jmp_cache_inval_tb(TranslationBlock *tb)
76
+ tb->jmp_insn_offset[0] = TB_JMP_OFFSET_INVALID;
283
+{
77
+ tb->jmp_insn_offset[1] = TB_JMP_OFFSET_INVALID;
284
+ CPUState *cpu;
78
285
+
79
tcg_reg_alloc_start(s);
286
+ if (TARGET_TB_PCREL) {
80
287
+ /* A TB may be at any virtual address */
288
+ CPU_FOREACH(cpu) {
289
+ tcg_flush_jmp_cache(cpu);
290
+ }
291
+ } else {
292
+ uint32_t h = tb_jmp_cache_hash_func(tb_pc(tb));
293
+
294
+ CPU_FOREACH(cpu) {
295
+ CPUJumpCache *jc = cpu->tb_jmp_cache;
296
+
297
+ if (qatomic_read(&jc->array[h].tb) == tb) {
298
+ qatomic_set(&jc->array[h].tb, NULL);
299
+ }
300
+ }
301
+ }
302
+}
303
+
304
/*
305
* In user-mode, call with mmap_lock held.
306
* In !user-mode, if @rm_from_page_list is set, call with the TB's pages'
307
@@ -XXX,XX +XXX,XX @@ static inline void tb_jmp_unlink(TranslationBlock *dest)
308
*/
309
static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
310
{
311
- CPUState *cpu;
312
PageDesc *p;
313
uint32_t h;
314
tb_page_addr_t phys_pc;
315
@@ -XXX,XX +XXX,XX @@ static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
316
317
/* remove the TB from the hash list */
318
phys_pc = tb->page_addr[0];
319
- h = tb_hash_func(phys_pc, tb_pc(tb), tb->flags, orig_cflags,
320
- tb->trace_vcpu_dstate);
321
+ h = tb_hash_func(phys_pc, (TARGET_TB_PCREL ? 0 : tb_pc(tb)),
322
+ tb->flags, orig_cflags, tb->trace_vcpu_dstate);
323
if (!qht_remove(&tb_ctx.htable, tb, h)) {
324
return;
325
}
326
@@ -XXX,XX +XXX,XX @@ static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
327
}
328
329
/* remove the TB from the hash list */
330
- h = tb_jmp_cache_hash_func(tb->pc);
331
- CPU_FOREACH(cpu) {
332
- CPUJumpCache *jc = cpu->tb_jmp_cache;
333
- if (qatomic_read(&jc->array[h].tb) == tb) {
334
- qatomic_set(&jc->array[h].tb, NULL);
335
- }
336
- }
337
+ tb_jmp_cache_inval_tb(tb);
338
339
/* suppress this TB from the two jump lists */
340
tb_remove_from_jmp_list(tb, 0);
341
@@ -XXX,XX +XXX,XX @@ tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
342
}
343
344
/* add in the hash table */
345
- h = tb_hash_func(phys_pc, tb_pc(tb), tb->flags, tb->cflags,
346
- tb->trace_vcpu_dstate);
347
+ h = tb_hash_func(phys_pc, (TARGET_TB_PCREL ? 0 : tb_pc(tb)),
348
+ tb->flags, tb->cflags, tb->trace_vcpu_dstate);
349
qht_insert(&tb_ctx.htable, tb, h, &existing_tb);
350
351
/* remove TB from the page(s) if we couldn't insert it */
352
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
353
354
gen_code_buf = tcg_ctx->code_gen_ptr;
355
tb->tc.ptr = tcg_splitwx_to_rx(gen_code_buf);
356
+#if !TARGET_TB_PCREL
357
tb->pc = pc;
358
+#endif
359
tb->cs_base = cs_base;
360
tb->flags = flags;
361
tb->cflags = cflags;
362
--
81
--
363
2.34.1
82
2.34.1
364
83
365
84
1
Let tb->page_addr[0] contain the address of the first byte of the
1
Replace 'tc_ptr' and 'addr' with 'tb' and 'n'.
2
translated block, rather than the address of the page containing the
3
start of the translated block. We need to recover this value anyway
4
at various points, and it is easier to discard a page offset when it
5
is not needed, which happens naturally via the existing find_page shift.
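
As a small compilable sketch of the consequence (the page size and the
struct below are demo stand-ins, not the QEMU definitions): with the
first-byte address stored, the start of the block is read directly and the
page is recovered with a single mask, instead of rebuilding the byte
address from tb->pc each time.

    #include <stdint.h>
    #include <stdio.h>

    #define DEMO_PAGE_BITS 12          /* demo value only */
    #define DEMO_PAGE_MASK (~(((uintptr_t)1 << DEMO_PAGE_BITS) - 1))

    typedef struct {
        uintptr_t page_addr[2];        /* [0] now holds the first byte's address */
        uint16_t size;
    } DemoTB;                          /* stand-in for TranslationBlock */

    int main(void)
    {
        DemoTB tb = { .page_addr = { 0x40001234, 0 }, .size = 0x20 };

        uintptr_t start = tb.page_addr[0];                  /* first byte */
        uintptr_t page  = tb.page_addr[0] & DEMO_PAGE_MASK; /* discard offset */
        uintptr_t end   = start + tb.size;

        printf("start=%#lx page=%#lx end=%#lx\n",
               (unsigned long)start, (unsigned long)page, (unsigned long)end);
        return 0;
    }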
6
2
7
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
3
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
8
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
4
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
9
---
5
---
10
accel/tcg/cpu-exec.c | 16 ++++++++--------
6
tcg/aarch64/tcg-target.h | 3 ++-
11
accel/tcg/cputlb.c | 3 ++-
7
tcg/arm/tcg-target.h | 3 ++-
12
accel/tcg/translate-all.c | 9 +++++----
8
tcg/i386/tcg-target.h | 9 ++-------
13
3 files changed, 15 insertions(+), 13 deletions(-)
9
tcg/loongarch64/tcg-target.h | 3 ++-
10
tcg/mips/tcg-target.h | 3 ++-
11
tcg/ppc/tcg-target.h | 3 ++-
12
tcg/riscv/tcg-target.h | 3 ++-
13
tcg/s390x/tcg-target.h | 10 ++--------
14
tcg/sparc64/tcg-target.h | 3 ++-
15
tcg/tci/tcg-target.h | 3 ++-
16
accel/tcg/cpu-exec.c | 11 ++++++++---
17
tcg/aarch64/tcg-target.c.inc | 5 +++--
18
tcg/i386/tcg-target.c.inc | 9 +++++++++
19
tcg/loongarch64/tcg-target.c.inc | 5 +++--
20
tcg/ppc/tcg-target.c.inc | 7 ++++---
21
tcg/s390x/tcg-target.c.inc | 10 ++++++++++
22
tcg/sparc64/tcg-target.c.inc | 7 ++++---
23
17 files changed, 61 insertions(+), 36 deletions(-)
14
24
25
diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
26
index XXXXXXX..XXXXXXX 100644
27
--- a/tcg/aarch64/tcg-target.h
28
+++ b/tcg/aarch64/tcg-target.h
29
@@ -XXX,XX +XXX,XX @@ typedef enum {
30
#define TCG_TARGET_DEFAULT_MO (0)
31
#define TCG_TARGET_HAS_MEMORY_BSWAP 0
32
33
-void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t);
34
+void tb_target_set_jmp_target(const TranslationBlock *, int,
35
+ uintptr_t, uintptr_t);
36
37
#define TCG_TARGET_NEED_LDST_LABELS
38
#define TCG_TARGET_NEED_POOL_LABELS
39
diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
40
index XXXXXXX..XXXXXXX 100644
41
--- a/tcg/arm/tcg-target.h
42
+++ b/tcg/arm/tcg-target.h
43
@@ -XXX,XX +XXX,XX @@ extern bool use_neon_instructions;
44
#define TCG_TARGET_HAS_MEMORY_BSWAP 0
45
46
/* not defined -- call should be eliminated at compile time */
47
-void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t);
48
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
49
+ uintptr_t, uintptr_t);
50
51
#define TCG_TARGET_NEED_LDST_LABELS
52
#define TCG_TARGET_NEED_POOL_LABELS
53
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
54
index XXXXXXX..XXXXXXX 100644
55
--- a/tcg/i386/tcg-target.h
56
+++ b/tcg/i386/tcg-target.h
57
@@ -XXX,XX +XXX,XX @@ extern bool have_movbe;
58
#define TCG_TARGET_extract_i64_valid(ofs, len) \
59
(((ofs) == 8 && (len) == 8) || ((ofs) + (len)) == 32)
60
61
-static inline void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
62
- uintptr_t jmp_rw, uintptr_t addr)
63
-{
64
- /* patch the branch destination */
65
- qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4));
66
- /* no need to flush icache explicitly */
67
-}
68
+void tb_target_set_jmp_target(const TranslationBlock *, int,
69
+ uintptr_t, uintptr_t);
70
71
/* This defines the natural memory order supported by this
72
* architecture before guarantees made by various barrier
73
diff --git a/tcg/loongarch64/tcg-target.h b/tcg/loongarch64/tcg-target.h
74
index XXXXXXX..XXXXXXX 100644
75
--- a/tcg/loongarch64/tcg-target.h
76
+++ b/tcg/loongarch64/tcg-target.h
77
@@ -XXX,XX +XXX,XX @@ typedef enum {
78
#define TCG_TARGET_HAS_muluh_i64 1
79
#define TCG_TARGET_HAS_mulsh_i64 1
80
81
-void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t);
82
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
83
+ uintptr_t, uintptr_t);
84
85
#define TCG_TARGET_DEFAULT_MO (0)
86
87
diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h
88
index XXXXXXX..XXXXXXX 100644
89
--- a/tcg/mips/tcg-target.h
90
+++ b/tcg/mips/tcg-target.h
91
@@ -XXX,XX +XXX,XX @@ extern bool use_mips32r2_instructions;
92
#define TCG_TARGET_HAS_MEMORY_BSWAP 1
93
94
/* not defined -- call should be eliminated at compile time */
95
-void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t)
96
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
97
+ uintptr_t, uintptr_t)
98
QEMU_ERROR("code path is reachable");
99
100
#define TCG_TARGET_NEED_LDST_LABELS
101
diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
102
index XXXXXXX..XXXXXXX 100644
103
--- a/tcg/ppc/tcg-target.h
104
+++ b/tcg/ppc/tcg-target.h
105
@@ -XXX,XX +XXX,XX @@ extern bool have_vsx;
106
#define TCG_TARGET_HAS_bitsel_vec have_vsx
107
#define TCG_TARGET_HAS_cmpsel_vec 0
108
109
-void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t);
110
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
111
+ uintptr_t, uintptr_t);
112
113
#define TCG_TARGET_DEFAULT_MO (0)
114
#define TCG_TARGET_HAS_MEMORY_BSWAP 1
115
diff --git a/tcg/riscv/tcg-target.h b/tcg/riscv/tcg-target.h
116
index XXXXXXX..XXXXXXX 100644
117
--- a/tcg/riscv/tcg-target.h
118
+++ b/tcg/riscv/tcg-target.h
119
@@ -XXX,XX +XXX,XX @@ typedef enum {
120
#endif
121
122
/* not defined -- call should be eliminated at compile time */
123
-void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t);
124
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
125
+ uintptr_t, uintptr_t);
126
127
#define TCG_TARGET_DEFAULT_MO (0)
128
129
diff --git a/tcg/s390x/tcg-target.h b/tcg/s390x/tcg-target.h
130
index XXXXXXX..XXXXXXX 100644
131
--- a/tcg/s390x/tcg-target.h
132
+++ b/tcg/s390x/tcg-target.h
133
@@ -XXX,XX +XXX,XX @@ extern uint64_t s390_facilities[3];
134
135
#define TCG_TARGET_DEFAULT_MO (TCG_MO_ALL & ~TCG_MO_ST_LD)
136
137
-static inline void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
138
- uintptr_t jmp_rw, uintptr_t addr)
139
-{
140
- /* patch the branch destination */
141
- intptr_t disp = addr - (jmp_rx - 2);
142
- qatomic_set((int32_t *)jmp_rw, disp / 2);
143
- /* no need to flush icache explicitly */
144
-}
145
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
146
+ uintptr_t jmp_rx, uintptr_t jmp_rw);
147
148
#define TCG_TARGET_NEED_LDST_LABELS
149
#define TCG_TARGET_NEED_POOL_LABELS
150
diff --git a/tcg/sparc64/tcg-target.h b/tcg/sparc64/tcg-target.h
151
index XXXXXXX..XXXXXXX 100644
152
--- a/tcg/sparc64/tcg-target.h
153
+++ b/tcg/sparc64/tcg-target.h
154
@@ -XXX,XX +XXX,XX @@ extern bool use_vis3_instructions;
155
#define TCG_TARGET_DEFAULT_MO (0)
156
#define TCG_TARGET_HAS_MEMORY_BSWAP 1
157
158
-void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t);
159
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
160
+ uintptr_t, uintptr_t);
161
162
#define TCG_TARGET_NEED_POOL_LABELS
163
164
diff --git a/tcg/tci/tcg-target.h b/tcg/tci/tcg-target.h
165
index XXXXXXX..XXXXXXX 100644
166
--- a/tcg/tci/tcg-target.h
167
+++ b/tcg/tci/tcg-target.h
168
@@ -XXX,XX +XXX,XX @@ typedef enum {
169
#define TCG_TARGET_HAS_MEMORY_BSWAP 1
170
171
/* not defined -- call should be eliminated at compile time */
172
-void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t);
173
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
174
+ uintptr_t, uintptr_t);
175
176
#endif /* TCG_TARGET_H */
15
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
177
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
16
index XXXXXXX..XXXXXXX 100644
178
index XXXXXXX..XXXXXXX 100644
17
--- a/accel/tcg/cpu-exec.c
179
--- a/accel/tcg/cpu-exec.c
18
+++ b/accel/tcg/cpu-exec.c
180
+++ b/accel/tcg/cpu-exec.c
19
@@ -XXX,XX +XXX,XX @@ struct tb_desc {
181
@@ -XXX,XX +XXX,XX @@ void tb_set_jmp_target(TranslationBlock *tb, int n, uintptr_t addr)
20
target_ulong pc;
182
{
21
target_ulong cs_base;
183
tb->jmp_target_addr[n] = addr;
22
CPUArchState *env;
184
if (TCG_TARGET_HAS_direct_jump) {
23
- tb_page_addr_t phys_page1;
185
+ /*
24
+ tb_page_addr_t page_addr0;
186
+ * Get the rx view of the structure, from which we find the
25
uint32_t flags;
187
+ * executable code address, and tb_target_set_jmp_target can
26
uint32_t cflags;
188
+ * produce a pc-relative displacement to jmp_target_addr[n].
27
uint32_t trace_vcpu_dstate;
189
+ */
28
@@ -XXX,XX +XXX,XX @@ static bool tb_lookup_cmp(const void *p, const void *d)
190
+ const TranslationBlock *c_tb = tcg_splitwx_to_rx(tb);
29
const struct tb_desc *desc = d;
191
uintptr_t offset = tb->jmp_insn_offset[n];
30
192
- uintptr_t tc_ptr = (uintptr_t)tb->tc.ptr;
31
if (tb->pc == desc->pc &&
193
- uintptr_t jmp_rx = tc_ptr + offset;
32
- tb->page_addr[0] == desc->phys_page1 &&
194
+ uintptr_t jmp_rx = (uintptr_t)tb->tc.ptr + offset;
33
+ tb->page_addr[0] == desc->page_addr0 &&
195
uintptr_t jmp_rw = jmp_rx - tcg_splitwx_diff;
34
tb->cs_base == desc->cs_base &&
196
- tb_target_set_jmp_target(tc_ptr, jmp_rx, jmp_rw, addr);
35
tb->flags == desc->flags &&
197
+ tb_target_set_jmp_target(c_tb, n, jmp_rx, jmp_rw);
36
tb->trace_vcpu_dstate == desc->trace_vcpu_dstate &&
37
@@ -XXX,XX +XXX,XX @@ static bool tb_lookup_cmp(const void *p, const void *d)
38
if (tb->page_addr[1] == -1) {
39
return true;
40
} else {
41
- tb_page_addr_t phys_page2;
42
- target_ulong virt_page2;
43
+ tb_page_addr_t phys_page1;
44
+ target_ulong virt_page1;
45
46
/*
47
* We know that the first page matched, and an otherwise valid TB
48
@@ -XXX,XX +XXX,XX @@ static bool tb_lookup_cmp(const void *p, const void *d)
49
* is different for the new TB. Therefore any exception raised
50
* here by the faulting lookup is not premature.
51
*/
52
- virt_page2 = TARGET_PAGE_ALIGN(desc->pc);
53
- phys_page2 = get_page_addr_code(desc->env, virt_page2);
54
- if (tb->page_addr[1] == phys_page2) {
55
+ virt_page1 = TARGET_PAGE_ALIGN(desc->pc);
56
+ phys_page1 = get_page_addr_code(desc->env, virt_page1);
57
+ if (tb->page_addr[1] == phys_page1) {
58
return true;
59
}
60
}
61
@@ -XXX,XX +XXX,XX @@ static TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc,
62
if (phys_pc == -1) {
63
return NULL;
64
}
198
}
65
- desc.phys_page1 = phys_pc & TARGET_PAGE_MASK;
199
}
66
+ desc.page_addr0 = phys_pc;
200
67
h = tb_hash_func(phys_pc, pc, flags, cflags, *cpu->trace_dstate);
201
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
68
return qht_lookup_custom(&tb_ctx.htable, &desc, h, tb_lookup_cmp);
202
index XXXXXXX..XXXXXXX 100644
69
}
203
--- a/tcg/aarch64/tcg-target.c.inc
70
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
204
+++ b/tcg/aarch64/tcg-target.c.inc
71
index XXXXXXX..XXXXXXX 100644
205
@@ -XXX,XX +XXX,XX @@ static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target,
72
--- a/accel/tcg/cputlb.c
206
tcg_out_call_int(s, target);
73
+++ b/accel/tcg/cputlb.c
207
}
74
@@ -XXX,XX +XXX,XX @@ void tlb_flush_page_bits_by_mmuidx_all_cpus_synced(CPUState *src_cpu,
208
75
can be detected */
209
-void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
76
void tlb_protect_code(ram_addr_t ram_addr)
210
- uintptr_t jmp_rw, uintptr_t addr)
77
{
211
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
78
- cpu_physical_memory_test_and_clear_dirty(ram_addr, TARGET_PAGE_SIZE,
212
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
79
+ cpu_physical_memory_test_and_clear_dirty(ram_addr & TARGET_PAGE_MASK,
213
{
80
+ TARGET_PAGE_SIZE,
214
+ uintptr_t addr = tb->jmp_target_addr[n];
81
DIRTY_MEMORY_CODE);
215
tcg_insn_unit i1, i2;
82
}
216
TCGType rt = TCG_TYPE_I64;
83
217
TCGReg rd = TCG_REG_TMP;
84
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
218
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
85
index XXXXXXX..XXXXXXX 100644
219
index XXXXXXX..XXXXXXX 100644
86
--- a/accel/tcg/translate-all.c
220
--- a/tcg/i386/tcg-target.c.inc
87
+++ b/accel/tcg/translate-all.c
221
+++ b/tcg/i386/tcg-target.c.inc
88
@@ -XXX,XX +XXX,XX @@ static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
222
@@ -XXX,XX +XXX,XX @@ static void tcg_out_goto_tb(TCGContext *s, int which)
89
qemu_spin_unlock(&tb->jmp_lock);
223
set_jmp_reset_offset(s, which);
90
224
}
91
/* remove the TB from the hash list */
225
92
- phys_pc = tb->page_addr[0] + (tb->pc & ~TARGET_PAGE_MASK);
226
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
93
+ phys_pc = tb->page_addr[0];
227
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
94
h = tb_hash_func(phys_pc, tb->pc, tb->flags, orig_cflags,
228
+{
95
tb->trace_vcpu_dstate);
229
+ /* patch the branch destination */
96
if (!qht_remove(&tb_ctx.htable, tb, h)) {
230
+ uintptr_t addr = tb->jmp_target_addr[n];
97
@@ -XXX,XX +XXX,XX @@ tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
231
+ qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4));
98
* we can only insert TBs that are fully initialized.
232
+ /* no need to flush icache explicitly */
99
*/
233
+}
100
page_lock_pair(&p, phys_pc, &p2, phys_page2, true);
234
+
101
- tb_page_add(p, tb, 0, phys_pc & TARGET_PAGE_MASK);
235
static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
102
+ tb_page_add(p, tb, 0, phys_pc);
236
const TCGArg args[TCG_MAX_OP_ARGS],
103
if (p2) {
237
const int const_args[TCG_MAX_OP_ARGS])
104
tb_page_add(p2, tb, 1, phys_page2);
238
diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
105
} else {
239
index XXXXXXX..XXXXXXX 100644
106
@@ -XXX,XX +XXX,XX @@ tb_invalidate_phys_page_range__locked(struct page_collection *pages,
240
--- a/tcg/loongarch64/tcg-target.c.inc
107
if (n == 0) {
241
+++ b/tcg/loongarch64/tcg-target.c.inc
108
/* NOTE: tb_end may be after the end of the page, but
242
@@ -XXX,XX +XXX,XX @@ static void tcg_out_nop(TCGContext *s)
109
it is not a problem */
243
tcg_out32(s, NOP);
110
- tb_start = tb->page_addr[0] + (tb->pc & ~TARGET_PAGE_MASK);
244
}
111
+ tb_start = tb->page_addr[0];
245
112
tb_end = tb_start + tb->size;
246
-void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
113
} else {
247
- uintptr_t jmp_rw, uintptr_t addr)
114
tb_start = tb->page_addr[1];
248
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
115
- tb_end = tb_start + ((tb->pc + tb->size) & ~TARGET_PAGE_MASK);
249
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
116
+ tb_end = tb_start + ((tb->page_addr[0] + tb->size)
250
{
117
+ & ~TARGET_PAGE_MASK);
251
tcg_insn_unit i1, i2;
118
}
252
ptrdiff_t upper, lower;
119
if (!(tb_end <= start || tb_start >= end)) {
253
+ uintptr_t addr = tb->jmp_target_addr[n];
120
#ifdef TARGET_HAS_PRECISE_SMC
254
ptrdiff_t offset = (ptrdiff_t)(addr - jmp_rx) >> 2;
255
256
if (offset == sextreg(offset, 0, 26)) {
257
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
258
index XXXXXXX..XXXXXXX 100644
259
--- a/tcg/ppc/tcg-target.c.inc
260
+++ b/tcg/ppc/tcg-target.c.inc
261
@@ -XXX,XX +XXX,XX @@ static inline void ppc64_replace4(uintptr_t rx, uintptr_t rw,
262
flush_idcache_range(rx, rw, 16);
263
}
264
265
-void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
266
- uintptr_t jmp_rw, uintptr_t addr)
267
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
268
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
269
{
270
tcg_insn_unit i0, i1, i2, i3;
271
- intptr_t tb_diff = addr - tc_ptr;
272
+ uintptr_t addr = tb->jmp_target_addr[n];
273
+ intptr_t tb_diff = addr - (uintptr_t)tb->tc.ptr;
274
intptr_t br_diff = addr - (jmp_rx + 4);
275
intptr_t lo, hi;
276
277
diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
278
index XXXXXXX..XXXXXXX 100644
279
--- a/tcg/s390x/tcg-target.c.inc
280
+++ b/tcg/s390x/tcg-target.c.inc
281
@@ -XXX,XX +XXX,XX @@ static void tcg_out_goto_tb(TCGContext *s, int which)
282
set_jmp_reset_offset(s, which);
283
}
284
285
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
286
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
287
+{
288
+ /* patch the branch destination */
289
+ uintptr_t addr = tb->jmp_target_addr[n];
290
+ intptr_t disp = addr - (jmp_rx - 2);
291
+ qatomic_set((int32_t *)jmp_rw, disp / 2);
292
+ /* no need to flush icache explicitly */
293
+}
294
+
295
# define OP_32_64(x) \
296
case glue(glue(INDEX_op_,x),_i32): \
297
case glue(glue(INDEX_op_,x),_i64)
298
diff --git a/tcg/sparc64/tcg-target.c.inc b/tcg/sparc64/tcg-target.c.inc
299
index XXXXXXX..XXXXXXX 100644
300
--- a/tcg/sparc64/tcg-target.c.inc
301
+++ b/tcg/sparc64/tcg-target.c.inc
302
@@ -XXX,XX +XXX,XX @@ void tcg_register_jit(const void *buf, size_t buf_size)
303
tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
304
}
305
306
-void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
307
- uintptr_t jmp_rw, uintptr_t addr)
308
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
309
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
310
{
311
- intptr_t tb_disp = addr - tc_ptr;
312
+ uintptr_t addr = tb->jmp_target_addr[n];
313
+ intptr_t tb_disp = addr - (uintptr_t)tb->tc.ptr;
314
intptr_t br_disp = addr - jmp_rx;
315
tcg_insn_unit i1, i2;
316
121
--
317
--
122
2.34.1
318
2.34.1
123
319
124
320
1
From: Alex Bennée <alex.bennee@linaro.org>
1
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
2
2
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
3
The class cast checkers are quite expensive and always on (unlike the
4
dynamic case whose checks are gated by CONFIG_QOM_CAST_DEBUG). To
5
avoid the overhead of repeatedly checking something which should never
6
change we cache the CPUClass reference for use in the hot code paths.
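
(For illustration only, not part of this patch: the shape of the hot
path before and after, assuming the usual "hw/core/cpu.h" definitions;
the wrapper function names below are made up.)

    #include "hw/core/cpu.h"

    /* Before: every call re-checks the QOM cast via CPU_GET_CLASS(). */
    static bool cpu_has_work_uncached(CPUState *cpu)
    {
        CPUClass *cc = CPU_GET_CLASS(cpu);
        return cc->has_work(cpu);
    }

    /* After: cpu->cc was filled in once by cpu_exec_realizefn(). */
    static bool cpu_has_work_cached(CPUState *cpu)
    {
        return cpu->cc->has_work(cpu);
    }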
7
8
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
9
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
10
Message-Id: <20220811151413.3350684-3-alex.bennee@linaro.org>
11
Signed-off-by: Cédric Le Goater <clg@kaod.org>
12
Message-Id: <20220923084803.498337-3-clg@kaod.org>
13
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
14
---
4
---
15
include/hw/core/cpu.h | 9 +++++++++
5
include/tcg/tcg.h | 3 +++
16
cpu.c | 9 ++++-----
6
tcg/aarch64/tcg-target.h | 4 ----
17
2 files changed, 13 insertions(+), 5 deletions(-)
7
tcg/arm/tcg-target.h | 5 -----
8
tcg/i386/tcg-target.h | 3 ---
9
tcg/loongarch64/tcg-target.h | 3 ---
10
tcg/mips/tcg-target.h | 5 -----
11
tcg/ppc/tcg-target.h | 4 ----
12
tcg/riscv/tcg-target.h | 4 ----
13
tcg/s390x/tcg-target.h | 4 ----
14
tcg/sparc64/tcg-target.h | 4 ----
15
tcg/tci/tcg-target.h | 4 ----
16
11 files changed, 3 insertions(+), 40 deletions(-)
18
17
19
diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
18
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
20
index XXXXXXX..XXXXXXX 100644
19
index XXXXXXX..XXXXXXX 100644
21
--- a/include/hw/core/cpu.h
20
--- a/include/tcg/tcg.h
22
+++ b/include/hw/core/cpu.h
21
+++ b/include/tcg/tcg.h
23
@@ -XXX,XX +XXX,XX @@ typedef int (*WriteCoreDumpFunction)(const void *buf, size_t size,
22
@@ -XXX,XX +XXX,XX @@ void tcg_func_start(TCGContext *s);
24
*/
23
25
#define CPU(obj) ((CPUState *)(obj))
24
int tcg_gen_code(TCGContext *s, TranslationBlock *tb, target_ulong pc_start);
26
25
27
+/*
26
+void tb_target_set_jmp_target(const TranslationBlock *, int,
28
+ * The class checkers bring in CPU_GET_CLASS() which is potentially
27
+ uintptr_t, uintptr_t);
29
+ * expensive given the eventual call to
28
+
30
+ * object_class_dynamic_cast_assert(). Because of this the CPUState
29
void tcg_set_frame(TCGContext *s, TCGReg reg, intptr_t start, intptr_t size);
31
+ * has a cached value for the class in cs->cc which is set up in
30
32
+ * cpu_exec_realizefn() for use in hot code paths.
31
TCGTemp *tcg_global_mem_new_internal(TCGType, TCGv_ptr,
33
+ */
32
diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
34
typedef struct CPUClass CPUClass;
35
DECLARE_CLASS_CHECKERS(CPUClass, CPU,
36
TYPE_CPU)
37
@@ -XXX,XX +XXX,XX @@ struct qemu_work_item;
38
struct CPUState {
39
/*< private >*/
40
DeviceState parent_obj;
41
+ /* cache to avoid expensive CPU_GET_CLASS */
42
+ CPUClass *cc;
43
/*< public >*/
44
45
int nr_cores;
46
diff --git a/cpu.c b/cpu.c
47
index XXXXXXX..XXXXXXX 100644
33
index XXXXXXX..XXXXXXX 100644
48
--- a/cpu.c
34
--- a/tcg/aarch64/tcg-target.h
49
+++ b/cpu.c
35
+++ b/tcg/aarch64/tcg-target.h
50
@@ -XXX,XX +XXX,XX @@ const VMStateDescription vmstate_cpu_common = {
36
@@ -XXX,XX +XXX,XX @@ typedef enum {
51
37
52
void cpu_exec_realizefn(CPUState *cpu, Error **errp)
38
#define TCG_TARGET_DEFAULT_MO (0)
53
{
39
#define TCG_TARGET_HAS_MEMORY_BSWAP 0
54
-#ifndef CONFIG_USER_ONLY
40
-
55
- CPUClass *cc = CPU_GET_CLASS(cpu);
41
-void tb_target_set_jmp_target(const TranslationBlock *, int,
56
-#endif
42
- uintptr_t, uintptr_t);
57
+ /* cache the cpu class for the hotpath */
43
-
58
+ cpu->cc = CPU_GET_CLASS(cpu);
44
#define TCG_TARGET_NEED_LDST_LABELS
59
45
#define TCG_TARGET_NEED_POOL_LABELS
60
cpu_list_add(cpu);
46
61
if (!accel_cpu_realizefn(cpu, errp)) {
47
diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
62
@@ -XXX,XX +XXX,XX @@ void cpu_exec_realizefn(CPUState *cpu, Error **errp)
48
index XXXXXXX..XXXXXXX 100644
63
if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
49
--- a/tcg/arm/tcg-target.h
64
vmstate_register(NULL, cpu->cpu_index, &vmstate_cpu_common, cpu);
50
+++ b/tcg/arm/tcg-target.h
65
}
51
@@ -XXX,XX +XXX,XX @@ extern bool use_neon_instructions;
66
- if (cc->sysemu_ops->legacy_vmsd != NULL) {
52
67
- vmstate_register(NULL, cpu->cpu_index, cc->sysemu_ops->legacy_vmsd, cpu);
53
#define TCG_TARGET_DEFAULT_MO (0)
68
+ if (cpu->cc->sysemu_ops->legacy_vmsd != NULL) {
54
#define TCG_TARGET_HAS_MEMORY_BSWAP 0
69
+ vmstate_register(NULL, cpu->cpu_index, cpu->cc->sysemu_ops->legacy_vmsd, cpu);
55
-
70
}
56
-/* not defined -- call should be eliminated at compile time */
71
#endif /* CONFIG_USER_ONLY */
57
-void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
72
}
58
- uintptr_t, uintptr_t);
59
-
60
#define TCG_TARGET_NEED_LDST_LABELS
61
#define TCG_TARGET_NEED_POOL_LABELS
62
63
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
64
index XXXXXXX..XXXXXXX 100644
65
--- a/tcg/i386/tcg-target.h
66
+++ b/tcg/i386/tcg-target.h
67
@@ -XXX,XX +XXX,XX @@ extern bool have_movbe;
68
#define TCG_TARGET_extract_i64_valid(ofs, len) \
69
(((ofs) == 8 && (len) == 8) || ((ofs) + (len)) == 32)
70
71
-void tb_target_set_jmp_target(const TranslationBlock *, int,
72
- uintptr_t, uintptr_t);
73
-
74
/* This defines the natural memory order supported by this
75
* architecture before guarantees made by various barrier
76
* instructions.
77
diff --git a/tcg/loongarch64/tcg-target.h b/tcg/loongarch64/tcg-target.h
78
index XXXXXXX..XXXXXXX 100644
79
--- a/tcg/loongarch64/tcg-target.h
80
+++ b/tcg/loongarch64/tcg-target.h
81
@@ -XXX,XX +XXX,XX @@ typedef enum {
82
#define TCG_TARGET_HAS_muluh_i64 1
83
#define TCG_TARGET_HAS_mulsh_i64 1
84
85
-void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
86
- uintptr_t, uintptr_t);
87
-
88
#define TCG_TARGET_DEFAULT_MO (0)
89
90
#define TCG_TARGET_NEED_LDST_LABELS
91
diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h
92
index XXXXXXX..XXXXXXX 100644
93
--- a/tcg/mips/tcg-target.h
94
+++ b/tcg/mips/tcg-target.h
95
@@ -XXX,XX +XXX,XX @@ extern bool use_mips32r2_instructions;
96
#define TCG_TARGET_DEFAULT_MO (0)
97
#define TCG_TARGET_HAS_MEMORY_BSWAP 1
98
99
-/* not defined -- call should be eliminated at compile time */
100
-void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
101
- uintptr_t, uintptr_t)
102
- QEMU_ERROR("code path is reachable");
103
-
104
#define TCG_TARGET_NEED_LDST_LABELS
105
106
#endif
107
diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
108
index XXXXXXX..XXXXXXX 100644
109
--- a/tcg/ppc/tcg-target.h
110
+++ b/tcg/ppc/tcg-target.h
111
@@ -XXX,XX +XXX,XX @@ extern bool have_vsx;
112
#define TCG_TARGET_HAS_bitsel_vec have_vsx
113
#define TCG_TARGET_HAS_cmpsel_vec 0
114
115
-void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
116
- uintptr_t, uintptr_t);
117
-
118
#define TCG_TARGET_DEFAULT_MO (0)
119
#define TCG_TARGET_HAS_MEMORY_BSWAP 1
120
-
121
#define TCG_TARGET_NEED_LDST_LABELS
122
#define TCG_TARGET_NEED_POOL_LABELS
123
124
diff --git a/tcg/riscv/tcg-target.h b/tcg/riscv/tcg-target.h
125
index XXXXXXX..XXXXXXX 100644
126
--- a/tcg/riscv/tcg-target.h
127
+++ b/tcg/riscv/tcg-target.h
128
@@ -XXX,XX +XXX,XX @@ typedef enum {
129
#define TCG_TARGET_HAS_mulsh_i64 1
130
#endif
131
132
-/* not defined -- call should be eliminated at compile time */
133
-void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
134
- uintptr_t, uintptr_t);
135
-
136
#define TCG_TARGET_DEFAULT_MO (0)
137
138
#define TCG_TARGET_NEED_LDST_LABELS
139
diff --git a/tcg/s390x/tcg-target.h b/tcg/s390x/tcg-target.h
140
index XXXXXXX..XXXXXXX 100644
141
--- a/tcg/s390x/tcg-target.h
142
+++ b/tcg/s390x/tcg-target.h
143
@@ -XXX,XX +XXX,XX @@ extern uint64_t s390_facilities[3];
144
#define TCG_TARGET_HAS_MEMORY_BSWAP 1
145
146
#define TCG_TARGET_DEFAULT_MO (TCG_MO_ALL & ~TCG_MO_ST_LD)
147
-
148
-void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
149
- uintptr_t jmp_rx, uintptr_t jmp_rw);
150
-
151
#define TCG_TARGET_NEED_LDST_LABELS
152
#define TCG_TARGET_NEED_POOL_LABELS
153
154
diff --git a/tcg/sparc64/tcg-target.h b/tcg/sparc64/tcg-target.h
155
index XXXXXXX..XXXXXXX 100644
156
--- a/tcg/sparc64/tcg-target.h
157
+++ b/tcg/sparc64/tcg-target.h
158
@@ -XXX,XX +XXX,XX @@ extern bool use_vis3_instructions;
159
160
#define TCG_TARGET_DEFAULT_MO (0)
161
#define TCG_TARGET_HAS_MEMORY_BSWAP 1
162
-
163
-void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
164
- uintptr_t, uintptr_t);
165
-
166
#define TCG_TARGET_NEED_POOL_LABELS
167
168
#endif
169
diff --git a/tcg/tci/tcg-target.h b/tcg/tci/tcg-target.h
170
index XXXXXXX..XXXXXXX 100644
171
--- a/tcg/tci/tcg-target.h
172
+++ b/tcg/tci/tcg-target.h
173
@@ -XXX,XX +XXX,XX @@ typedef enum {
174
175
#define TCG_TARGET_HAS_MEMORY_BSWAP 1
176
177
-/* not defined -- call should be eliminated at compile time */
178
-void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
179
- uintptr_t, uintptr_t);
180
-
181
#endif /* TCG_TARGET_H */
73
--
182
--
74
2.34.1
183
2.34.1
75
184
76
185
1
Populate this new method for all targets. Always match
1
Install empty versions for !TCG_TARGET_HAS_direct_jump hosts.
2
the result that would be given by cpu_get_tb_cpu_state,
3
as we will want these values to correspond in the logs.
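
(Illustrative only, not code added by this series: a consumer of the
new hook would look roughly like this, so that the value logged matches
what cpu_get_tb_cpu_state would report.)

    #include "hw/core/cpu.h"
    #include "qemu/log.h"

    static void log_cpu_pc(CPUState *cpu)
    {
        CPUClass *cc = CPU_GET_CLASS(cpu);
        vaddr pc = cc->get_pc(cpu);

        qemu_log("cpu %d: pc=0x%" VADDR_PRIx "\n", cpu->cpu_index, pc);
    }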
4
2
5
Reviewed-by: Taylor Simpson <tsimpson@quicinc.com>
6
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
3
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
7
Reviewed-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk> (target/sparc)
4
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
8
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
9
---
6
---
10
Cc: Eduardo Habkost <eduardo@habkost.net> (supporter:Machine core)
7
tcg/arm/tcg-target.c.inc | 6 ++++++
11
Cc: Marcel Apfelbaum <marcel.apfelbaum@gmail.com> (supporter:Machine core)
8
tcg/mips/tcg-target.c.inc | 6 ++++++
12
Cc: "Philippe Mathieu-Daudé" <f4bug@amsat.org> (reviewer:Machine core)
9
tcg/riscv/tcg-target.c.inc | 6 ++++++
13
Cc: Yanan Wang <wangyanan55@huawei.com> (reviewer:Machine core)
10
tcg/tci/tcg-target.c.inc | 6 ++++++
14
Cc: Michael Rolnik <mrolnik@gmail.com> (maintainer:AVR TCG CPUs)
11
4 files changed, 24 insertions(+)
15
Cc: "Edgar E. Iglesias" <edgar.iglesias@gmail.com> (maintainer:CRIS TCG CPUs)
16
Cc: Taylor Simpson <tsimpson@quicinc.com> (supporter:Hexagon TCG CPUs)
17
Cc: Song Gao <gaosong@loongson.cn> (maintainer:LoongArch TCG CPUs)
18
Cc: Xiaojuan Yang <yangxiaojuan@loongson.cn> (maintainer:LoongArch TCG CPUs)
19
Cc: Laurent Vivier <laurent@vivier.eu> (maintainer:M68K TCG CPUs)
20
Cc: Jiaxun Yang <jiaxun.yang@flygoat.com> (reviewer:MIPS TCG CPUs)
21
Cc: Aleksandar Rikalo <aleksandar.rikalo@syrmia.com> (reviewer:MIPS TCG CPUs)
22
Cc: Chris Wulff <crwulff@gmail.com> (maintainer:NiosII TCG CPUs)
23
Cc: Marek Vasut <marex@denx.de> (maintainer:NiosII TCG CPUs)
24
Cc: Stafford Horne <shorne@gmail.com> (odd fixer:OpenRISC TCG CPUs)
25
Cc: Yoshinori Sato <ysato@users.sourceforge.jp> (reviewer:RENESAS RX CPUs)
26
Cc: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk> (maintainer:SPARC TCG CPUs)
27
Cc: Bastian Koppelmann <kbastian@mail.uni-paderborn.de> (maintainer:TriCore TCG CPUs)
28
Cc: Max Filippov <jcmvbkbc@gmail.com> (maintainer:Xtensa TCG CPUs)
29
Cc: qemu-arm@nongnu.org (open list:ARM TCG CPUs)
30
Cc: qemu-ppc@nongnu.org (open list:PowerPC TCG CPUs)
31
Cc: qemu-riscv@nongnu.org (open list:RISC-V TCG CPUs)
32
Cc: qemu-s390x@nongnu.org (open list:S390 TCG CPUs)
33
---
34
include/hw/core/cpu.h | 3 +++
35
target/alpha/cpu.c | 9 +++++++++
36
target/arm/cpu.c | 13 +++++++++++++
37
target/avr/cpu.c | 8 ++++++++
38
target/cris/cpu.c | 8 ++++++++
39
target/hexagon/cpu.c | 8 ++++++++
40
target/hppa/cpu.c | 8 ++++++++
41
target/i386/cpu.c | 9 +++++++++
42
target/loongarch/cpu.c | 9 +++++++++
43
target/m68k/cpu.c | 8 ++++++++
44
target/microblaze/cpu.c | 8 ++++++++
45
target/mips/cpu.c | 8 ++++++++
46
target/nios2/cpu.c | 9 +++++++++
47
target/openrisc/cpu.c | 8 ++++++++
48
target/ppc/cpu_init.c | 8 ++++++++
49
target/riscv/cpu.c | 13 +++++++++++++
50
target/rx/cpu.c | 8 ++++++++
51
target/s390x/cpu.c | 8 ++++++++
52
target/sh4/cpu.c | 8 ++++++++
53
target/sparc/cpu.c | 8 ++++++++
54
target/tricore/cpu.c | 9 +++++++++
55
target/xtensa/cpu.c | 8 ++++++++
56
22 files changed, 186 insertions(+)
57
12
58
diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
13
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
59
index XXXXXXX..XXXXXXX 100644
14
index XXXXXXX..XXXXXXX 100644
60
--- a/include/hw/core/cpu.h
15
--- a/tcg/arm/tcg-target.c.inc
61
+++ b/include/hw/core/cpu.h
16
+++ b/tcg/arm/tcg-target.c.inc
62
@@ -XXX,XX +XXX,XX @@ struct SysemuCPUOps;
17
@@ -XXX,XX +XXX,XX @@ static void tcg_out_goto_tb(TCGContext *s, int which)
63
* If the target behaviour here is anything other than "set
18
set_jmp_reset_offset(s, which);
64
* the PC register to the value passed in" then the target must
65
* also implement the synchronize_from_tb hook.
66
+ * @get_pc: Callback for getting the Program Counter register.
67
+ * As above, with the semantics of the target architecture.
68
* @gdb_read_register: Callback for letting GDB read a register.
69
* @gdb_write_register: Callback for letting GDB write a register.
70
* @gdb_adjust_breakpoint: Callback for adjusting the address of a
71
@@ -XXX,XX +XXX,XX @@ struct CPUClass {
72
void (*dump_state)(CPUState *cpu, FILE *, int flags);
73
int64_t (*get_arch_id)(CPUState *cpu);
74
void (*set_pc)(CPUState *cpu, vaddr value);
75
+ vaddr (*get_pc)(CPUState *cpu);
76
int (*gdb_read_register)(CPUState *cpu, GByteArray *buf, int reg);
77
int (*gdb_write_register)(CPUState *cpu, uint8_t *buf, int reg);
78
vaddr (*gdb_adjust_breakpoint)(CPUState *cpu, vaddr addr);
79
diff --git a/target/alpha/cpu.c b/target/alpha/cpu.c
80
index XXXXXXX..XXXXXXX 100644
81
--- a/target/alpha/cpu.c
82
+++ b/target/alpha/cpu.c
83
@@ -XXX,XX +XXX,XX @@ static void alpha_cpu_set_pc(CPUState *cs, vaddr value)
84
cpu->env.pc = value;
85
}
19
}
86
20
87
+static vaddr alpha_cpu_get_pc(CPUState *cs)
21
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
22
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
88
+{
23
+{
89
+ AlphaCPU *cpu = ALPHA_CPU(cs);
24
+ /* Always indirect, nothing to do */
90
+
91
+ return cpu->env.pc;
92
+}
25
+}
93
+
26
+
94
+
27
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
95
static bool alpha_cpu_has_work(CPUState *cs)
28
const TCGArg args[TCG_MAX_OP_ARGS],
96
{
29
const int const_args[TCG_MAX_OP_ARGS])
97
/* Here we are checking to see if the CPU should wake up from HALT.
30
diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
98
@@ -XXX,XX +XXX,XX @@ static void alpha_cpu_class_init(ObjectClass *oc, void *data)
99
cc->has_work = alpha_cpu_has_work;
100
cc->dump_state = alpha_cpu_dump_state;
101
cc->set_pc = alpha_cpu_set_pc;
102
+ cc->get_pc = alpha_cpu_get_pc;
103
cc->gdb_read_register = alpha_cpu_gdb_read_register;
104
cc->gdb_write_register = alpha_cpu_gdb_write_register;
105
#ifndef CONFIG_USER_ONLY
106
diff --git a/target/arm/cpu.c b/target/arm/cpu.c
107
index XXXXXXX..XXXXXXX 100644
31
index XXXXXXX..XXXXXXX 100644
108
--- a/target/arm/cpu.c
32
--- a/tcg/mips/tcg-target.c.inc
109
+++ b/target/arm/cpu.c
33
+++ b/tcg/mips/tcg-target.c.inc
110
@@ -XXX,XX +XXX,XX @@ static void arm_cpu_set_pc(CPUState *cs, vaddr value)
34
@@ -XXX,XX +XXX,XX @@ static void tcg_out_goto_tb(TCGContext *s, int which)
111
}
35
set_jmp_reset_offset(s, which);
112
}
36
}
113
37
114
+static vaddr arm_cpu_get_pc(CPUState *cs)
38
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
39
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
115
+{
40
+{
116
+ ARMCPU *cpu = ARM_CPU(cs);
41
+ /* Always indirect, nothing to do */
117
+ CPUARMState *env = &cpu->env;
118
+
119
+ if (is_a64(env)) {
120
+ return env->pc;
121
+ } else {
122
+ return env->regs[15];
123
+ }
124
+}
42
+}
125
+
43
+
126
#ifdef CONFIG_TCG
44
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
127
void arm_cpu_synchronize_from_tb(CPUState *cs,
45
const TCGArg args[TCG_MAX_OP_ARGS],
128
const TranslationBlock *tb)
46
const int const_args[TCG_MAX_OP_ARGS])
129
@@ -XXX,XX +XXX,XX @@ static void arm_cpu_class_init(ObjectClass *oc, void *data)
47
diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
130
cc->has_work = arm_cpu_has_work;
131
cc->dump_state = arm_cpu_dump_state;
132
cc->set_pc = arm_cpu_set_pc;
133
+ cc->get_pc = arm_cpu_get_pc;
134
cc->gdb_read_register = arm_cpu_gdb_read_register;
135
cc->gdb_write_register = arm_cpu_gdb_write_register;
136
#ifndef CONFIG_USER_ONLY
137
diff --git a/target/avr/cpu.c b/target/avr/cpu.c
138
index XXXXXXX..XXXXXXX 100644
48
index XXXXXXX..XXXXXXX 100644
139
--- a/target/avr/cpu.c
49
--- a/tcg/riscv/tcg-target.c.inc
140
+++ b/target/avr/cpu.c
50
+++ b/tcg/riscv/tcg-target.c.inc
141
@@ -XXX,XX +XXX,XX @@ static void avr_cpu_set_pc(CPUState *cs, vaddr value)
51
@@ -XXX,XX +XXX,XX @@ static void tcg_out_goto_tb(TCGContext *s, int which)
142
cpu->env.pc_w = value / 2; /* internally PC points to words */
52
set_jmp_reset_offset(s, which);
143
}
53
}
144
54
145
+static vaddr avr_cpu_get_pc(CPUState *cs)
55
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
56
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
146
+{
57
+{
147
+ AVRCPU *cpu = AVR_CPU(cs);
58
+ /* Always indirect, nothing to do */
148
+
149
+ return cpu->env.pc_w * 2;
150
+}
59
+}
151
+
60
+
152
static bool avr_cpu_has_work(CPUState *cs)
61
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
153
{
62
const TCGArg args[TCG_MAX_OP_ARGS],
154
AVRCPU *cpu = AVR_CPU(cs);
63
const int const_args[TCG_MAX_OP_ARGS])
155
@@ -XXX,XX +XXX,XX @@ static void avr_cpu_class_init(ObjectClass *oc, void *data)
64
diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
156
cc->has_work = avr_cpu_has_work;
157
cc->dump_state = avr_cpu_dump_state;
158
cc->set_pc = avr_cpu_set_pc;
159
+ cc->get_pc = avr_cpu_get_pc;
160
dc->vmsd = &vms_avr_cpu;
161
cc->sysemu_ops = &avr_sysemu_ops;
162
cc->disas_set_info = avr_cpu_disas_set_info;
163
diff --git a/target/cris/cpu.c b/target/cris/cpu.c
164
index XXXXXXX..XXXXXXX 100644
65
index XXXXXXX..XXXXXXX 100644
165
--- a/target/cris/cpu.c
66
--- a/tcg/tci/tcg-target.c.inc
166
+++ b/target/cris/cpu.c
67
+++ b/tcg/tci/tcg-target.c.inc
167
@@ -XXX,XX +XXX,XX @@ static void cris_cpu_set_pc(CPUState *cs, vaddr value)
68
@@ -XXX,XX +XXX,XX @@ static void tcg_out_goto_tb(TCGContext *s, int which)
168
cpu->env.pc = value;
69
set_jmp_reset_offset(s, which);
169
}
70
}
170
71
171
+static vaddr cris_cpu_get_pc(CPUState *cs)
72
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
73
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
172
+{
74
+{
173
+ CRISCPU *cpu = CRIS_CPU(cs);
75
+ /* Always indirect, nothing to do */
174
+
175
+ return cpu->env.pc;
176
+}
76
+}
177
+
77
+
178
static bool cris_cpu_has_work(CPUState *cs)
78
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
179
{
79
const TCGArg args[TCG_MAX_OP_ARGS],
180
return cs->interrupt_request & (CPU_INTERRUPT_HARD | CPU_INTERRUPT_NMI);
80
const int const_args[TCG_MAX_OP_ARGS])
181
@@ -XXX,XX +XXX,XX @@ static void cris_cpu_class_init(ObjectClass *oc, void *data)
182
cc->has_work = cris_cpu_has_work;
183
cc->dump_state = cris_cpu_dump_state;
184
cc->set_pc = cris_cpu_set_pc;
185
+ cc->get_pc = cris_cpu_get_pc;
186
cc->gdb_read_register = cris_cpu_gdb_read_register;
187
cc->gdb_write_register = cris_cpu_gdb_write_register;
188
#ifndef CONFIG_USER_ONLY
189
diff --git a/target/hexagon/cpu.c b/target/hexagon/cpu.c
190
index XXXXXXX..XXXXXXX 100644
191
--- a/target/hexagon/cpu.c
192
+++ b/target/hexagon/cpu.c
193
@@ -XXX,XX +XXX,XX @@ static void hexagon_cpu_set_pc(CPUState *cs, vaddr value)
194
env->gpr[HEX_REG_PC] = value;
195
}
196
197
+static vaddr hexagon_cpu_get_pc(CPUState *cs)
198
+{
199
+ HexagonCPU *cpu = HEXAGON_CPU(cs);
200
+ CPUHexagonState *env = &cpu->env;
201
+ return env->gpr[HEX_REG_PC];
202
+}
203
+
204
static void hexagon_cpu_synchronize_from_tb(CPUState *cs,
205
const TranslationBlock *tb)
206
{
207
@@ -XXX,XX +XXX,XX @@ static void hexagon_cpu_class_init(ObjectClass *c, void *data)
208
cc->has_work = hexagon_cpu_has_work;
209
cc->dump_state = hexagon_dump_state;
210
cc->set_pc = hexagon_cpu_set_pc;
211
+ cc->get_pc = hexagon_cpu_get_pc;
212
cc->gdb_read_register = hexagon_gdb_read_register;
213
cc->gdb_write_register = hexagon_gdb_write_register;
214
cc->gdb_num_core_regs = TOTAL_PER_THREAD_REGS + NUM_VREGS + NUM_QREGS;
215
diff --git a/target/hppa/cpu.c b/target/hppa/cpu.c
216
index XXXXXXX..XXXXXXX 100644
217
--- a/target/hppa/cpu.c
218
+++ b/target/hppa/cpu.c
219
@@ -XXX,XX +XXX,XX @@ static void hppa_cpu_set_pc(CPUState *cs, vaddr value)
220
cpu->env.iaoq_b = value + 4;
221
}
222
223
+static vaddr hppa_cpu_get_pc(CPUState *cs)
224
+{
225
+ HPPACPU *cpu = HPPA_CPU(cs);
226
+
227
+ return cpu->env.iaoq_f;
228
+}
229
+
230
static void hppa_cpu_synchronize_from_tb(CPUState *cs,
231
const TranslationBlock *tb)
232
{
233
@@ -XXX,XX +XXX,XX @@ static void hppa_cpu_class_init(ObjectClass *oc, void *data)
234
cc->has_work = hppa_cpu_has_work;
235
cc->dump_state = hppa_cpu_dump_state;
236
cc->set_pc = hppa_cpu_set_pc;
237
+ cc->get_pc = hppa_cpu_get_pc;
238
cc->gdb_read_register = hppa_cpu_gdb_read_register;
239
cc->gdb_write_register = hppa_cpu_gdb_write_register;
240
#ifndef CONFIG_USER_ONLY
241
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
242
index XXXXXXX..XXXXXXX 100644
243
--- a/target/i386/cpu.c
244
+++ b/target/i386/cpu.c
245
@@ -XXX,XX +XXX,XX @@ static void x86_cpu_set_pc(CPUState *cs, vaddr value)
246
cpu->env.eip = value;
247
}
248
249
+static vaddr x86_cpu_get_pc(CPUState *cs)
250
+{
251
+ X86CPU *cpu = X86_CPU(cs);
252
+
253
+ /* Match cpu_get_tb_cpu_state. */
254
+ return cpu->env.eip + cpu->env.segs[R_CS].base;
255
+}
256
+
257
int x86_cpu_pending_interrupt(CPUState *cs, int interrupt_request)
258
{
259
X86CPU *cpu = X86_CPU(cs);
260
@@ -XXX,XX +XXX,XX @@ static void x86_cpu_common_class_init(ObjectClass *oc, void *data)
261
cc->has_work = x86_cpu_has_work;
262
cc->dump_state = x86_cpu_dump_state;
263
cc->set_pc = x86_cpu_set_pc;
264
+ cc->get_pc = x86_cpu_get_pc;
265
cc->gdb_read_register = x86_cpu_gdb_read_register;
266
cc->gdb_write_register = x86_cpu_gdb_write_register;
267
cc->get_arch_id = x86_cpu_get_arch_id;
268
diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c
269
index XXXXXXX..XXXXXXX 100644
270
--- a/target/loongarch/cpu.c
271
+++ b/target/loongarch/cpu.c
272
@@ -XXX,XX +XXX,XX @@ static void loongarch_cpu_set_pc(CPUState *cs, vaddr value)
273
env->pc = value;
274
}
275
276
+static vaddr loongarch_cpu_get_pc(CPUState *cs)
277
+{
278
+ LoongArchCPU *cpu = LOONGARCH_CPU(cs);
279
+ CPULoongArchState *env = &cpu->env;
280
+
281
+ return env->pc;
282
+}
283
+
284
#ifndef CONFIG_USER_ONLY
285
#include "hw/loongarch/virt.h"
286
287
@@ -XXX,XX +XXX,XX @@ static void loongarch_cpu_class_init(ObjectClass *c, void *data)
288
cc->has_work = loongarch_cpu_has_work;
289
cc->dump_state = loongarch_cpu_dump_state;
290
cc->set_pc = loongarch_cpu_set_pc;
291
+ cc->get_pc = loongarch_cpu_get_pc;
292
#ifndef CONFIG_USER_ONLY
293
dc->vmsd = &vmstate_loongarch_cpu;
294
cc->sysemu_ops = &loongarch_sysemu_ops;
295
diff --git a/target/m68k/cpu.c b/target/m68k/cpu.c
296
index XXXXXXX..XXXXXXX 100644
297
--- a/target/m68k/cpu.c
298
+++ b/target/m68k/cpu.c
299
@@ -XXX,XX +XXX,XX @@ static void m68k_cpu_set_pc(CPUState *cs, vaddr value)
300
cpu->env.pc = value;
301
}
302
303
+static vaddr m68k_cpu_get_pc(CPUState *cs)
304
+{
305
+ M68kCPU *cpu = M68K_CPU(cs);
306
+
307
+ return cpu->env.pc;
308
+}
309
+
310
static bool m68k_cpu_has_work(CPUState *cs)
311
{
312
return cs->interrupt_request & CPU_INTERRUPT_HARD;
313
@@ -XXX,XX +XXX,XX @@ static void m68k_cpu_class_init(ObjectClass *c, void *data)
314
cc->has_work = m68k_cpu_has_work;
315
cc->dump_state = m68k_cpu_dump_state;
316
cc->set_pc = m68k_cpu_set_pc;
317
+ cc->get_pc = m68k_cpu_get_pc;
318
cc->gdb_read_register = m68k_cpu_gdb_read_register;
319
cc->gdb_write_register = m68k_cpu_gdb_write_register;
320
#if defined(CONFIG_SOFTMMU)
321
diff --git a/target/microblaze/cpu.c b/target/microblaze/cpu.c
322
index XXXXXXX..XXXXXXX 100644
323
--- a/target/microblaze/cpu.c
324
+++ b/target/microblaze/cpu.c
325
@@ -XXX,XX +XXX,XX @@ static void mb_cpu_set_pc(CPUState *cs, vaddr value)
326
cpu->env.iflags = 0;
327
}
328
329
+static vaddr mb_cpu_get_pc(CPUState *cs)
330
+{
331
+ MicroBlazeCPU *cpu = MICROBLAZE_CPU(cs);
332
+
333
+ return cpu->env.pc;
334
+}
335
+
336
static void mb_cpu_synchronize_from_tb(CPUState *cs,
337
const TranslationBlock *tb)
338
{
339
@@ -XXX,XX +XXX,XX @@ static void mb_cpu_class_init(ObjectClass *oc, void *data)
340
341
cc->dump_state = mb_cpu_dump_state;
342
cc->set_pc = mb_cpu_set_pc;
343
+ cc->get_pc = mb_cpu_get_pc;
344
cc->gdb_read_register = mb_cpu_gdb_read_register;
345
cc->gdb_write_register = mb_cpu_gdb_write_register;
346
347
diff --git a/target/mips/cpu.c b/target/mips/cpu.c
348
index XXXXXXX..XXXXXXX 100644
349
--- a/target/mips/cpu.c
350
+++ b/target/mips/cpu.c
351
@@ -XXX,XX +XXX,XX @@ static void mips_cpu_set_pc(CPUState *cs, vaddr value)
352
mips_env_set_pc(&cpu->env, value);
353
}
354
355
+static vaddr mips_cpu_get_pc(CPUState *cs)
356
+{
357
+ MIPSCPU *cpu = MIPS_CPU(cs);
358
+
359
+ return cpu->env.active_tc.PC;
360
+}
361
+
362
static bool mips_cpu_has_work(CPUState *cs)
363
{
364
MIPSCPU *cpu = MIPS_CPU(cs);
365
@@ -XXX,XX +XXX,XX @@ static void mips_cpu_class_init(ObjectClass *c, void *data)
366
cc->has_work = mips_cpu_has_work;
367
cc->dump_state = mips_cpu_dump_state;
368
cc->set_pc = mips_cpu_set_pc;
369
+ cc->get_pc = mips_cpu_get_pc;
370
cc->gdb_read_register = mips_cpu_gdb_read_register;
371
cc->gdb_write_register = mips_cpu_gdb_write_register;
372
#ifndef CONFIG_USER_ONLY
373
diff --git a/target/nios2/cpu.c b/target/nios2/cpu.c
374
index XXXXXXX..XXXXXXX 100644
375
--- a/target/nios2/cpu.c
376
+++ b/target/nios2/cpu.c
377
@@ -XXX,XX +XXX,XX @@ static void nios2_cpu_set_pc(CPUState *cs, vaddr value)
378
env->pc = value;
379
}
380
381
+static vaddr nios2_cpu_get_pc(CPUState *cs)
382
+{
383
+ Nios2CPU *cpu = NIOS2_CPU(cs);
384
+ CPUNios2State *env = &cpu->env;
385
+
386
+ return env->pc;
387
+}
388
+
389
static bool nios2_cpu_has_work(CPUState *cs)
390
{
391
return cs->interrupt_request & CPU_INTERRUPT_HARD;
392
@@ -XXX,XX +XXX,XX @@ static void nios2_cpu_class_init(ObjectClass *oc, void *data)
393
cc->has_work = nios2_cpu_has_work;
394
cc->dump_state = nios2_cpu_dump_state;
395
cc->set_pc = nios2_cpu_set_pc;
396
+ cc->get_pc = nios2_cpu_get_pc;
397
cc->disas_set_info = nios2_cpu_disas_set_info;
398
#ifndef CONFIG_USER_ONLY
399
cc->sysemu_ops = &nios2_sysemu_ops;
400
diff --git a/target/openrisc/cpu.c b/target/openrisc/cpu.c
401
index XXXXXXX..XXXXXXX 100644
402
--- a/target/openrisc/cpu.c
403
+++ b/target/openrisc/cpu.c
404
@@ -XXX,XX +XXX,XX @@ static void openrisc_cpu_set_pc(CPUState *cs, vaddr value)
405
cpu->env.dflag = 0;
406
}
407
408
+static vaddr openrisc_cpu_get_pc(CPUState *cs)
409
+{
410
+ OpenRISCCPU *cpu = OPENRISC_CPU(cs);
411
+
412
+ return cpu->env.pc;
413
+}
414
+
415
static void openrisc_cpu_synchronize_from_tb(CPUState *cs,
416
const TranslationBlock *tb)
417
{
418
@@ -XXX,XX +XXX,XX @@ static void openrisc_cpu_class_init(ObjectClass *oc, void *data)
419
cc->has_work = openrisc_cpu_has_work;
420
cc->dump_state = openrisc_cpu_dump_state;
421
cc->set_pc = openrisc_cpu_set_pc;
422
+ cc->get_pc = openrisc_cpu_get_pc;
423
cc->gdb_read_register = openrisc_cpu_gdb_read_register;
424
cc->gdb_write_register = openrisc_cpu_gdb_write_register;
425
#ifndef CONFIG_USER_ONLY
426
diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c
427
index XXXXXXX..XXXXXXX 100644
428
--- a/target/ppc/cpu_init.c
429
+++ b/target/ppc/cpu_init.c
430
@@ -XXX,XX +XXX,XX @@ static void ppc_cpu_set_pc(CPUState *cs, vaddr value)
431
cpu->env.nip = value;
432
}
433
434
+static vaddr ppc_cpu_get_pc(CPUState *cs)
435
+{
436
+ PowerPCCPU *cpu = POWERPC_CPU(cs);
437
+
438
+ return cpu->env.nip;
439
+}
440
+
441
static bool ppc_cpu_has_work(CPUState *cs)
442
{
443
PowerPCCPU *cpu = POWERPC_CPU(cs);
444
@@ -XXX,XX +XXX,XX @@ static void ppc_cpu_class_init(ObjectClass *oc, void *data)
445
cc->has_work = ppc_cpu_has_work;
446
cc->dump_state = ppc_cpu_dump_state;
447
cc->set_pc = ppc_cpu_set_pc;
448
+ cc->get_pc = ppc_cpu_get_pc;
449
cc->gdb_read_register = ppc_cpu_gdb_read_register;
450
cc->gdb_write_register = ppc_cpu_gdb_write_register;
451
#ifndef CONFIG_USER_ONLY
452
diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
453
index XXXXXXX..XXXXXXX 100644
454
--- a/target/riscv/cpu.c
455
+++ b/target/riscv/cpu.c
456
@@ -XXX,XX +XXX,XX @@ static void riscv_cpu_set_pc(CPUState *cs, vaddr value)
457
}
458
}
459
460
+static vaddr riscv_cpu_get_pc(CPUState *cs)
461
+{
462
+ RISCVCPU *cpu = RISCV_CPU(cs);
463
+ CPURISCVState *env = &cpu->env;
464
+
465
+ /* Match cpu_get_tb_cpu_state. */
466
+ if (env->xl == MXL_RV32) {
467
+ return env->pc & UINT32_MAX;
468
+ }
469
+ return env->pc;
470
+}
471
+
472
static void riscv_cpu_synchronize_from_tb(CPUState *cs,
473
const TranslationBlock *tb)
474
{
475
@@ -XXX,XX +XXX,XX @@ static void riscv_cpu_class_init(ObjectClass *c, void *data)
476
cc->has_work = riscv_cpu_has_work;
477
cc->dump_state = riscv_cpu_dump_state;
478
cc->set_pc = riscv_cpu_set_pc;
479
+ cc->get_pc = riscv_cpu_get_pc;
480
cc->gdb_read_register = riscv_cpu_gdb_read_register;
481
cc->gdb_write_register = riscv_cpu_gdb_write_register;
482
cc->gdb_num_core_regs = 33;
483
diff --git a/target/rx/cpu.c b/target/rx/cpu.c
484
index XXXXXXX..XXXXXXX 100644
485
--- a/target/rx/cpu.c
486
+++ b/target/rx/cpu.c
487
@@ -XXX,XX +XXX,XX @@ static void rx_cpu_set_pc(CPUState *cs, vaddr value)
488
cpu->env.pc = value;
489
}
490
491
+static vaddr rx_cpu_get_pc(CPUState *cs)
492
+{
493
+ RXCPU *cpu = RX_CPU(cs);
494
+
495
+ return cpu->env.pc;
496
+}
497
+
498
static void rx_cpu_synchronize_from_tb(CPUState *cs,
499
const TranslationBlock *tb)
500
{
501
@@ -XXX,XX +XXX,XX @@ static void rx_cpu_class_init(ObjectClass *klass, void *data)
502
cc->has_work = rx_cpu_has_work;
503
cc->dump_state = rx_cpu_dump_state;
504
cc->set_pc = rx_cpu_set_pc;
505
+ cc->get_pc = rx_cpu_get_pc;
506
507
#ifndef CONFIG_USER_ONLY
508
cc->sysemu_ops = &rx_sysemu_ops;
509
diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c
510
index XXXXXXX..XXXXXXX 100644
511
--- a/target/s390x/cpu.c
512
+++ b/target/s390x/cpu.c
513
@@ -XXX,XX +XXX,XX @@ static void s390_cpu_set_pc(CPUState *cs, vaddr value)
514
cpu->env.psw.addr = value;
515
}
516
517
+static vaddr s390_cpu_get_pc(CPUState *cs)
518
+{
519
+ S390CPU *cpu = S390_CPU(cs);
520
+
521
+ return cpu->env.psw.addr;
522
+}
523
+
524
static bool s390_cpu_has_work(CPUState *cs)
525
{
526
S390CPU *cpu = S390_CPU(cs);
527
@@ -XXX,XX +XXX,XX @@ static void s390_cpu_class_init(ObjectClass *oc, void *data)
528
cc->has_work = s390_cpu_has_work;
529
cc->dump_state = s390_cpu_dump_state;
530
cc->set_pc = s390_cpu_set_pc;
531
+ cc->get_pc = s390_cpu_get_pc;
532
cc->gdb_read_register = s390_cpu_gdb_read_register;
533
cc->gdb_write_register = s390_cpu_gdb_write_register;
534
#ifndef CONFIG_USER_ONLY
535
diff --git a/target/sh4/cpu.c b/target/sh4/cpu.c
536
index XXXXXXX..XXXXXXX 100644
537
--- a/target/sh4/cpu.c
538
+++ b/target/sh4/cpu.c
539
@@ -XXX,XX +XXX,XX @@ static void superh_cpu_set_pc(CPUState *cs, vaddr value)
540
cpu->env.pc = value;
541
}
542
543
+static vaddr superh_cpu_get_pc(CPUState *cs)
544
+{
545
+ SuperHCPU *cpu = SUPERH_CPU(cs);
546
+
547
+ return cpu->env.pc;
548
+}
549
+
550
static void superh_cpu_synchronize_from_tb(CPUState *cs,
551
const TranslationBlock *tb)
552
{
553
@@ -XXX,XX +XXX,XX @@ static void superh_cpu_class_init(ObjectClass *oc, void *data)
554
cc->has_work = superh_cpu_has_work;
555
cc->dump_state = superh_cpu_dump_state;
556
cc->set_pc = superh_cpu_set_pc;
557
+ cc->get_pc = superh_cpu_get_pc;
558
cc->gdb_read_register = superh_cpu_gdb_read_register;
559
cc->gdb_write_register = superh_cpu_gdb_write_register;
560
#ifndef CONFIG_USER_ONLY
561
diff --git a/target/sparc/cpu.c b/target/sparc/cpu.c
562
index XXXXXXX..XXXXXXX 100644
563
--- a/target/sparc/cpu.c
564
+++ b/target/sparc/cpu.c
565
@@ -XXX,XX +XXX,XX @@ static void sparc_cpu_set_pc(CPUState *cs, vaddr value)
566
cpu->env.npc = value + 4;
567
}
568
569
+static vaddr sparc_cpu_get_pc(CPUState *cs)
570
+{
571
+ SPARCCPU *cpu = SPARC_CPU(cs);
572
+
573
+ return cpu->env.pc;
574
+}
575
+
576
static void sparc_cpu_synchronize_from_tb(CPUState *cs,
577
const TranslationBlock *tb)
578
{
579
@@ -XXX,XX +XXX,XX @@ static void sparc_cpu_class_init(ObjectClass *oc, void *data)
580
cc->memory_rw_debug = sparc_cpu_memory_rw_debug;
581
#endif
582
cc->set_pc = sparc_cpu_set_pc;
583
+ cc->get_pc = sparc_cpu_get_pc;
584
cc->gdb_read_register = sparc_cpu_gdb_read_register;
585
cc->gdb_write_register = sparc_cpu_gdb_write_register;
586
#ifndef CONFIG_USER_ONLY
587
diff --git a/target/tricore/cpu.c b/target/tricore/cpu.c
588
index XXXXXXX..XXXXXXX 100644
589
--- a/target/tricore/cpu.c
590
+++ b/target/tricore/cpu.c
591
@@ -XXX,XX +XXX,XX @@ static void tricore_cpu_set_pc(CPUState *cs, vaddr value)
592
env->PC = value & ~(target_ulong)1;
593
}
594
595
+static vaddr tricore_cpu_get_pc(CPUState *cs)
596
+{
597
+ TriCoreCPU *cpu = TRICORE_CPU(cs);
598
+ CPUTriCoreState *env = &cpu->env;
599
+
600
+ return env->PC;
601
+}
602
+
603
static void tricore_cpu_synchronize_from_tb(CPUState *cs,
604
const TranslationBlock *tb)
605
{
606
@@ -XXX,XX +XXX,XX @@ static void tricore_cpu_class_init(ObjectClass *c, void *data)
607
608
cc->dump_state = tricore_cpu_dump_state;
609
cc->set_pc = tricore_cpu_set_pc;
610
+ cc->get_pc = tricore_cpu_get_pc;
611
cc->sysemu_ops = &tricore_sysemu_ops;
612
cc->tcg_ops = &tricore_tcg_ops;
613
}
614
diff --git a/target/xtensa/cpu.c b/target/xtensa/cpu.c
615
index XXXXXXX..XXXXXXX 100644
616
--- a/target/xtensa/cpu.c
617
+++ b/target/xtensa/cpu.c
618
@@ -XXX,XX +XXX,XX @@ static void xtensa_cpu_set_pc(CPUState *cs, vaddr value)
619
cpu->env.pc = value;
620
}
621
622
+static vaddr xtensa_cpu_get_pc(CPUState *cs)
623
+{
624
+ XtensaCPU *cpu = XTENSA_CPU(cs);
625
+
626
+ return cpu->env.pc;
627
+}
628
+
629
static bool xtensa_cpu_has_work(CPUState *cs)
630
{
631
#ifndef CONFIG_USER_ONLY
632
@@ -XXX,XX +XXX,XX @@ static void xtensa_cpu_class_init(ObjectClass *oc, void *data)
633
cc->has_work = xtensa_cpu_has_work;
634
cc->dump_state = xtensa_cpu_dump_state;
635
cc->set_pc = xtensa_cpu_set_pc;
636
+ cc->get_pc = xtensa_cpu_get_pc;
637
cc->gdb_read_register = xtensa_cpu_gdb_read_register;
638
cc->gdb_write_register = xtensa_cpu_gdb_write_register;
639
cc->gdb_stop_before_watchpoint = true;
640
--
81
--
641
2.34.1
82
2.34.1
642
83
643
84
1
Wrap the bare TranslationBlock pointer into a structure.
1
We now have the option to generate direct or indirect
2
goto_tb depending on the dynamic displacement, thus
3
the define is no longer necessary or completely accurate.
2
4
3
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
4
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
7
---
7
accel/tcg/tb-hash.h | 1 +
8
tcg/aarch64/tcg-target.h | 1 -
8
accel/tcg/tb-jmp-cache.h | 24 ++++++++++++++++++++++++
9
tcg/arm/tcg-target.h | 1 -
9
include/exec/cpu-common.h | 1 +
10
tcg/i386/tcg-target.h | 1 -
10
include/hw/core/cpu.h | 15 +--------------
11
tcg/loongarch64/tcg-target.h | 1 -
11
include/qemu/typedefs.h | 1 +
12
tcg/mips/tcg-target.h | 1 -
12
accel/stubs/tcg-stub.c | 4 ++++
13
tcg/ppc/tcg-target.h | 1 -
13
accel/tcg/cpu-exec.c | 10 +++++++---
14
tcg/riscv/tcg-target.h | 1 -
14
accel/tcg/cputlb.c | 9 +++++----
15
tcg/s390x/tcg-target.h | 1 -
15
accel/tcg/translate-all.c | 28 +++++++++++++++++++++++++---
16
tcg/sparc64/tcg-target.h | 1 -
16
hw/core/cpu-common.c | 3 +--
17
tcg/tci/tcg-target.h | 1 -
17
plugins/core.c | 2 +-
18
accel/tcg/cpu-exec.c | 23 +++++++++++------------
18
trace/control-target.c | 2 +-
19
tcg/tcg.c | 1 -
19
12 files changed, 72 insertions(+), 28 deletions(-)
20
tcg/arm/tcg-target.c.inc | 1 -
20
create mode 100644 accel/tcg/tb-jmp-cache.h
21
tcg/mips/tcg-target.c.inc | 1 -
21
22
tcg/riscv/tcg-target.c.inc | 1 -
22
diff --git a/accel/tcg/tb-hash.h b/accel/tcg/tb-hash.h
23
tcg/s390x/tcg-target.c.inc | 3 +++
23
index XXXXXXX..XXXXXXX 100644
24
tcg/tci/tcg-target.c.inc | 1 -
24
--- a/accel/tcg/tb-hash.h
25
17 files changed, 14 insertions(+), 27 deletions(-)
25
+++ b/accel/tcg/tb-hash.h
26
27
diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
28
index XXXXXXX..XXXXXXX 100644
29
--- a/tcg/aarch64/tcg-target.h
30
+++ b/tcg/aarch64/tcg-target.h
31
@@ -XXX,XX +XXX,XX @@ typedef enum {
32
#define TCG_TARGET_HAS_muls2_i64 0
33
#define TCG_TARGET_HAS_muluh_i64 1
34
#define TCG_TARGET_HAS_mulsh_i64 1
35
-#define TCG_TARGET_HAS_direct_jump 1
36
37
#define TCG_TARGET_HAS_v64 1
38
#define TCG_TARGET_HAS_v128 1
39
diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
40
index XXXXXXX..XXXXXXX 100644
41
--- a/tcg/arm/tcg-target.h
42
+++ b/tcg/arm/tcg-target.h
43
@@ -XXX,XX +XXX,XX @@ extern bool use_neon_instructions;
44
#define TCG_TARGET_HAS_mulsh_i32 0
45
#define TCG_TARGET_HAS_div_i32 use_idiv_instructions
46
#define TCG_TARGET_HAS_rem_i32 0
47
-#define TCG_TARGET_HAS_direct_jump 0
48
#define TCG_TARGET_HAS_qemu_st8_i32 0
49
50
#define TCG_TARGET_HAS_v64 use_neon_instructions
51
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
52
index XXXXXXX..XXXXXXX 100644
53
--- a/tcg/i386/tcg-target.h
54
+++ b/tcg/i386/tcg-target.h
55
@@ -XXX,XX +XXX,XX @@ extern bool have_movbe;
56
#define TCG_TARGET_HAS_muls2_i32 1
57
#define TCG_TARGET_HAS_muluh_i32 0
58
#define TCG_TARGET_HAS_mulsh_i32 0
59
-#define TCG_TARGET_HAS_direct_jump 1
60
61
#if TCG_TARGET_REG_BITS == 64
62
/* Keep target addresses zero-extended in a register. */
63
diff --git a/tcg/loongarch64/tcg-target.h b/tcg/loongarch64/tcg-target.h
64
index XXXXXXX..XXXXXXX 100644
65
--- a/tcg/loongarch64/tcg-target.h
66
+++ b/tcg/loongarch64/tcg-target.h
67
@@ -XXX,XX +XXX,XX @@ typedef enum {
68
#define TCG_TARGET_HAS_clz_i32 1
69
#define TCG_TARGET_HAS_ctz_i32 1
70
#define TCG_TARGET_HAS_ctpop_i32 0
71
-#define TCG_TARGET_HAS_direct_jump 1
72
#define TCG_TARGET_HAS_brcond2 0
73
#define TCG_TARGET_HAS_setcond2 0
74
#define TCG_TARGET_HAS_qemu_st8_i32 0
75
diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h
76
index XXXXXXX..XXXXXXX 100644
77
--- a/tcg/mips/tcg-target.h
78
+++ b/tcg/mips/tcg-target.h
79
@@ -XXX,XX +XXX,XX @@ extern bool use_mips32r2_instructions;
80
#define TCG_TARGET_HAS_muluh_i32 1
81
#define TCG_TARGET_HAS_mulsh_i32 1
82
#define TCG_TARGET_HAS_bswap32_i32 1
83
-#define TCG_TARGET_HAS_direct_jump 0
84
85
#if TCG_TARGET_REG_BITS == 64
86
#define TCG_TARGET_HAS_add2_i32 0
87
diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
88
index XXXXXXX..XXXXXXX 100644
89
--- a/tcg/ppc/tcg-target.h
90
+++ b/tcg/ppc/tcg-target.h
91
@@ -XXX,XX +XXX,XX @@ extern bool have_vsx;
92
#define TCG_TARGET_HAS_muls2_i32 0
93
#define TCG_TARGET_HAS_muluh_i32 1
94
#define TCG_TARGET_HAS_mulsh_i32 1
95
-#define TCG_TARGET_HAS_direct_jump 1
96
#define TCG_TARGET_HAS_qemu_st8_i32 0
97
98
#if TCG_TARGET_REG_BITS == 64
99
diff --git a/tcg/riscv/tcg-target.h b/tcg/riscv/tcg-target.h
100
index XXXXXXX..XXXXXXX 100644
101
--- a/tcg/riscv/tcg-target.h
102
+++ b/tcg/riscv/tcg-target.h
103
@@ -XXX,XX +XXX,XX @@ typedef enum {
104
#define TCG_TARGET_HAS_clz_i32 0
105
#define TCG_TARGET_HAS_ctz_i32 0
106
#define TCG_TARGET_HAS_ctpop_i32 0
107
-#define TCG_TARGET_HAS_direct_jump 0
108
#define TCG_TARGET_HAS_brcond2 1
109
#define TCG_TARGET_HAS_setcond2 1
110
#define TCG_TARGET_HAS_qemu_st8_i32 0
111
diff --git a/tcg/s390x/tcg-target.h b/tcg/s390x/tcg-target.h
112
index XXXXXXX..XXXXXXX 100644
113
--- a/tcg/s390x/tcg-target.h
114
+++ b/tcg/s390x/tcg-target.h
115
@@ -XXX,XX +XXX,XX @@ extern uint64_t s390_facilities[3];
116
#define TCG_TARGET_HAS_mulsh_i32 0
117
#define TCG_TARGET_HAS_extrl_i64_i32 0
118
#define TCG_TARGET_HAS_extrh_i64_i32 0
119
-#define TCG_TARGET_HAS_direct_jump 1
120
#define TCG_TARGET_HAS_qemu_st8_i32 0
121
122
#define TCG_TARGET_HAS_div2_i64 1
123
diff --git a/tcg/sparc64/tcg-target.h b/tcg/sparc64/tcg-target.h
124
index XXXXXXX..XXXXXXX 100644
125
--- a/tcg/sparc64/tcg-target.h
126
+++ b/tcg/sparc64/tcg-target.h
127
@@ -XXX,XX +XXX,XX @@ extern bool use_vis3_instructions;
128
#define TCG_TARGET_HAS_muls2_i32 1
129
#define TCG_TARGET_HAS_muluh_i32 0
130
#define TCG_TARGET_HAS_mulsh_i32 0
131
-#define TCG_TARGET_HAS_direct_jump 1
132
#define TCG_TARGET_HAS_qemu_st8_i32 0
133
134
#define TCG_TARGET_HAS_extrl_i64_i32 1
135
diff --git a/tcg/tci/tcg-target.h b/tcg/tci/tcg-target.h
136
index XXXXXXX..XXXXXXX 100644
137
--- a/tcg/tci/tcg-target.h
138
+++ b/tcg/tci/tcg-target.h
26
@@ -XXX,XX +XXX,XX @@
139
@@ -XXX,XX +XXX,XX @@
27
#include "exec/cpu-defs.h"
140
#define TCG_TARGET_HAS_muls2_i32 1
28
#include "exec/exec-all.h"
141
#define TCG_TARGET_HAS_muluh_i32 0
29
#include "qemu/xxhash.h"
142
#define TCG_TARGET_HAS_mulsh_i32 0
30
+#include "tb-jmp-cache.h"
143
-#define TCG_TARGET_HAS_direct_jump 0
31
144
#define TCG_TARGET_HAS_qemu_st8_i32 0
32
#ifdef CONFIG_SOFTMMU
145
33
146
#if TCG_TARGET_REG_BITS == 64
34
diff --git a/accel/tcg/tb-jmp-cache.h b/accel/tcg/tb-jmp-cache.h
35
new file mode 100644
36
index XXXXXXX..XXXXXXX
37
--- /dev/null
38
+++ b/accel/tcg/tb-jmp-cache.h
39
@@ -XXX,XX +XXX,XX @@
40
+/*
41
+ * The per-CPU TranslationBlock jump cache.
42
+ *
43
+ * Copyright (c) 2003 Fabrice Bellard
44
+ *
45
+ * SPDX-License-Identifier: GPL-2.0-or-later
46
+ */
47
+
48
+#ifndef ACCEL_TCG_TB_JMP_CACHE_H
49
+#define ACCEL_TCG_TB_JMP_CACHE_H
50
+
51
+#define TB_JMP_CACHE_BITS 12
52
+#define TB_JMP_CACHE_SIZE (1 << TB_JMP_CACHE_BITS)
53
+
54
+/*
55
+ * Accessed in parallel; all accesses to 'tb' must be atomic.
56
+ */
57
+struct CPUJumpCache {
58
+ struct {
59
+ TranslationBlock *tb;
60
+ } array[TB_JMP_CACHE_SIZE];
61
+};
62
+
63
+#endif /* ACCEL_TCG_TB_JMP_CACHE_H */
64
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
65
index XXXXXXX..XXXXXXX 100644
66
--- a/include/exec/cpu-common.h
67
+++ b/include/exec/cpu-common.h
68
@@ -XXX,XX +XXX,XX @@ void cpu_list_unlock(void);
69
unsigned int cpu_list_generation_id_get(void);
70
71
void tcg_flush_softmmu_tlb(CPUState *cs);
72
+void tcg_flush_jmp_cache(CPUState *cs);
73
74
void tcg_iommu_init_notifier_list(CPUState *cpu);
75
void tcg_iommu_free_notifier_list(CPUState *cpu);
76
diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
77
index XXXXXXX..XXXXXXX 100644
78
--- a/include/hw/core/cpu.h
79
+++ b/include/hw/core/cpu.h
80
@@ -XXX,XX +XXX,XX @@ struct kvm_run;
81
struct hax_vcpu_state;
82
struct hvf_vcpu_state;
83
84
-#define TB_JMP_CACHE_BITS 12
85
-#define TB_JMP_CACHE_SIZE (1 << TB_JMP_CACHE_BITS)
86
-
87
/* work queue */
88
89
/* The union type allows passing of 64 bit target pointers on 32 bit
90
@@ -XXX,XX +XXX,XX @@ struct CPUState {
91
CPUArchState *env_ptr;
92
IcountDecr *icount_decr_ptr;
93
94
- /* Accessed in parallel; all accesses must be atomic */
95
- TranslationBlock *tb_jmp_cache[TB_JMP_CACHE_SIZE];
96
+ CPUJumpCache *tb_jmp_cache;
97
98
struct GDBRegisterState *gdb_regs;
99
int gdb_num_regs;
100
@@ -XXX,XX +XXX,XX @@ extern CPUTailQ cpus;
101
102
extern __thread CPUState *current_cpu;
103
104
-static inline void cpu_tb_jmp_cache_clear(CPUState *cpu)
105
-{
106
- unsigned int i;
107
-
108
- for (i = 0; i < TB_JMP_CACHE_SIZE; i++) {
109
- qatomic_set(&cpu->tb_jmp_cache[i], NULL);
110
- }
111
-}
112
-
113
/**
114
* qemu_tcg_mttcg_enabled:
115
* Check whether we are running MultiThread TCG or not.
116
diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h
117
index XXXXXXX..XXXXXXX 100644
118
--- a/include/qemu/typedefs.h
119
+++ b/include/qemu/typedefs.h
120
@@ -XXX,XX +XXX,XX @@ typedef struct CoMutex CoMutex;
121
typedef struct ConfidentialGuestSupport ConfidentialGuestSupport;
122
typedef struct CPUAddressSpace CPUAddressSpace;
123
typedef struct CPUArchState CPUArchState;
124
+typedef struct CPUJumpCache CPUJumpCache;
125
typedef struct CPUState CPUState;
126
typedef struct CPUTLBEntryFull CPUTLBEntryFull;
127
typedef struct DeviceListener DeviceListener;
128
diff --git a/accel/stubs/tcg-stub.c b/accel/stubs/tcg-stub.c
129
index XXXXXXX..XXXXXXX 100644
130
--- a/accel/stubs/tcg-stub.c
131
+++ b/accel/stubs/tcg-stub.c
132
@@ -XXX,XX +XXX,XX @@ void tlb_set_dirty(CPUState *cpu, target_ulong vaddr)
133
{
134
}
135
136
+void tcg_flush_jmp_cache(CPUState *cpu)
137
+{
138
+}
139
+
140
int probe_access_flags(CPUArchState *env, target_ulong addr,
141
MMUAccessType access_type, int mmu_idx,
142
bool nonfault, void **phost, uintptr_t retaddr)
143
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
147
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
144
index XXXXXXX..XXXXXXX 100644
148
index XXXXXXX..XXXXXXX 100644
145
--- a/accel/tcg/cpu-exec.c
149
--- a/accel/tcg/cpu-exec.c
146
+++ b/accel/tcg/cpu-exec.c
150
+++ b/accel/tcg/cpu-exec.c
147
@@ -XXX,XX +XXX,XX @@
151
@@ -XXX,XX +XXX,XX @@ void cpu_exec_step_atomic(CPUState *cpu)
148
#include "sysemu/replay.h"
152
149
#include "sysemu/tcg.h"
153
void tb_set_jmp_target(TranslationBlock *tb, int n, uintptr_t addr)
150
#include "exec/helper-proto.h"
154
{
151
+#include "tb-jmp-cache.h"
155
+ /*
152
#include "tb-hash.h"
156
+ * Get the rx view of the structure, from which we find the
153
#include "tb-context.h"
157
+ * executable code address, and tb_target_set_jmp_target can
154
#include "internal.h"
158
+ * produce a pc-relative displacement to jmp_target_addr[n].
155
@@ -XXX,XX +XXX,XX @@ static inline TranslationBlock *tb_lookup(CPUState *cpu, target_ulong pc,
159
+ */
156
tcg_debug_assert(!(cflags & CF_INVALID));
160
+ const TranslationBlock *c_tb = tcg_splitwx_to_rx(tb);
157
161
+ uintptr_t offset = tb->jmp_insn_offset[n];
158
hash = tb_jmp_cache_hash_func(pc);
162
+ uintptr_t jmp_rx = (uintptr_t)tb->tc.ptr + offset;
159
- tb = qatomic_rcu_read(&cpu->tb_jmp_cache[hash]);
163
+ uintptr_t jmp_rw = jmp_rx - tcg_splitwx_diff;
160
+ tb = qatomic_rcu_read(&cpu->tb_jmp_cache->array[hash].tb);
164
+
161
165
tb->jmp_target_addr[n] = addr;
162
if (likely(tb &&
166
- if (TCG_TARGET_HAS_direct_jump) {
163
tb->pc == pc &&
167
- /*
164
@@ -XXX,XX +XXX,XX @@ static inline TranslationBlock *tb_lookup(CPUState *cpu, target_ulong pc,
168
- * Get the rx view of the structure, from which we find the
165
if (tb == NULL) {
169
- * executable code address, and tb_target_set_jmp_target can
166
return NULL;
170
- * produce a pc-relative displacement to jmp_target_addr[n].
167
}
171
- */
168
- qatomic_set(&cpu->tb_jmp_cache[hash], tb);
172
- const TranslationBlock *c_tb = tcg_splitwx_to_rx(tb);
169
+ qatomic_set(&cpu->tb_jmp_cache->array[hash].tb, tb);
173
- uintptr_t offset = tb->jmp_insn_offset[n];
170
return tb;
174
- uintptr_t jmp_rx = (uintptr_t)tb->tc.ptr + offset;
175
- uintptr_t jmp_rw = jmp_rx - tcg_splitwx_diff;
176
- tb_target_set_jmp_target(c_tb, n, jmp_rx, jmp_rw);
177
- }
178
+ tb_target_set_jmp_target(c_tb, n, jmp_rx, jmp_rw);
171
}
179
}
172
180
173
@@ -XXX,XX +XXX,XX @@ int cpu_exec(CPUState *cpu)
181
static inline void tb_add_jump(TranslationBlock *tb, int n,
174
182
diff --git a/tcg/tcg.c b/tcg/tcg.c
175
tb = tb_lookup(cpu, pc, cs_base, flags, cflags);
183
index XXXXXXX..XXXXXXX 100644
176
if (tb == NULL) {
184
--- a/tcg/tcg.c
177
+ uint32_t h;
185
+++ b/tcg/tcg.c
178
+
186
@@ -XXX,XX +XXX,XX @@ static void G_GNUC_UNUSED set_jmp_insn_offset(TCGContext *s, int which)
179
mmap_lock();
187
* We will check for overflow at the end of the opcode loop in
180
tb = tb_gen_code(cpu, pc, cs_base, flags, cflags);
188
* tcg_gen_code, where we bound tcg_current_code_size to UINT16_MAX.
181
mmap_unlock();
189
*/
182
@@ -XXX,XX +XXX,XX @@ int cpu_exec(CPUState *cpu)
190
- tcg_debug_assert(TCG_TARGET_HAS_direct_jump);
183
* We add the TB in the virtual pc hash table
191
s->gen_tb->jmp_insn_offset[which] = tcg_current_code_size(s);
184
* for the fast lookup
185
*/
186
- qatomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);
187
+ h = tb_jmp_cache_hash_func(pc);
188
+ qatomic_set(&cpu->tb_jmp_cache->array[h].tb, tb);
189
}
190
191
#ifndef CONFIG_USER_ONLY
192
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
193
index XXXXXXX..XXXXXXX 100644
194
--- a/accel/tcg/cputlb.c
195
+++ b/accel/tcg/cputlb.c
196
@@ -XXX,XX +XXX,XX @@ static void tlb_window_reset(CPUTLBDesc *desc, int64_t ns,
197
198
static void tb_jmp_cache_clear_page(CPUState *cpu, target_ulong page_addr)
199
{
200
- unsigned int i, i0 = tb_jmp_cache_hash_page(page_addr);
201
+ int i, i0 = tb_jmp_cache_hash_page(page_addr);
202
+ CPUJumpCache *jc = cpu->tb_jmp_cache;
203
204
for (i = 0; i < TB_JMP_PAGE_SIZE; i++) {
205
- qatomic_set(&cpu->tb_jmp_cache[i0 + i], NULL);
206
+ qatomic_set(&jc->array[i0 + i].tb, NULL);
207
}
208
}
192
}
209
193
210
@@ -XXX,XX +XXX,XX @@ static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, run_on_cpu_data data)
194
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
211
195
index XXXXXXX..XXXXXXX 100644
212
qemu_spin_unlock(&env_tlb(env)->c.lock);
196
--- a/tcg/arm/tcg-target.c.inc
213
197
+++ b/tcg/arm/tcg-target.c.inc
214
- cpu_tb_jmp_cache_clear(cpu);
198
@@ -XXX,XX +XXX,XX @@ static void tcg_out_goto_tb(TCGContext *s, int which)
215
+ tcg_flush_jmp_cache(cpu);
199
intptr_t ptr, dif, dil;
216
200
TCGReg base = TCG_REG_PC;
217
if (to_clean == ALL_MMUIDX_BITS) {
201
218
qatomic_set(&env_tlb(env)->c.full_flush_count,
202
- qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
219
@@ -XXX,XX +XXX,XX @@ static void tlb_flush_range_by_mmuidx_async_0(CPUState *cpu,
203
ptr = get_jmp_target_addr(s, which);
220
* longer to clear each entry individually than it will to clear it all.
204
dif = tcg_pcrel_diff(s, (void *)ptr) - 8;
221
*/
205
dil = sextract32(dif, 0, 12);
222
if (d.len >= (TARGET_PAGE_SIZE * TB_JMP_CACHE_SIZE)) {
206
diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
223
- cpu_tb_jmp_cache_clear(cpu);
207
index XXXXXXX..XXXXXXX 100644
224
+ tcg_flush_jmp_cache(cpu);
208
--- a/tcg/mips/tcg-target.c.inc
225
return;
209
+++ b/tcg/mips/tcg-target.c.inc
226
}
210
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
227
211
static void tcg_out_goto_tb(TCGContext *s, int which)
228
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
212
{
229
index XXXXXXX..XXXXXXX 100644
213
/* indirect jump method */
230
--- a/accel/tcg/translate-all.c
214
- qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
231
+++ b/accel/tcg/translate-all.c
215
tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP0, TCG_REG_ZERO,
232
@@ -XXX,XX +XXX,XX @@
216
get_jmp_target_addr(s, which));
233
#include "sysemu/tcg.h"
217
tcg_out_opc_reg(s, OPC_JR, 0, TCG_TMP0, 0);
234
#include "qapi/error.h"
218
diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
235
#include "hw/core/tcg-cpu-ops.h"
219
index XXXXXXX..XXXXXXX 100644
236
+#include "tb-jmp-cache.h"
220
--- a/tcg/riscv/tcg-target.c.inc
237
#include "tb-hash.h"
221
+++ b/tcg/riscv/tcg-target.c.inc
238
#include "tb-context.h"
222
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
239
#include "internal.h"
223
240
@@ -XXX,XX +XXX,XX @@ static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count)
224
static void tcg_out_goto_tb(TCGContext *s, int which)
241
}
225
{
242
226
- qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
243
CPU_FOREACH(cpu) {
227
/* indirect jump method */
244
- cpu_tb_jmp_cache_clear(cpu);
228
tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, TCG_REG_ZERO,
245
+ tcg_flush_jmp_cache(cpu);
229
get_jmp_target_addr(s, which));
246
}
230
diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
247
231
index XXXXXXX..XXXXXXX 100644
248
qht_reset_size(&tb_ctx.htable, CODE_GEN_HTABLE_SIZE);
232
--- a/tcg/s390x/tcg-target.c.inc
249
@@ -XXX,XX +XXX,XX @@ static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
233
+++ b/tcg/s390x/tcg-target.c.inc
250
/* remove the TB from the hash list */
234
@@ -XXX,XX +XXX,XX @@ static void tcg_out_goto_tb(TCGContext *s, int which)
251
h = tb_jmp_cache_hash_func(tb->pc);
235
void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
252
CPU_FOREACH(cpu) {
236
uintptr_t jmp_rx, uintptr_t jmp_rw)
253
- if (qatomic_read(&cpu->tb_jmp_cache[h]) == tb) {
237
{
254
- qatomic_set(&cpu->tb_jmp_cache[h], NULL);
238
+ if (!HAVE_FACILITY(GEN_INST_EXT)) {
255
+ CPUJumpCache *jc = cpu->tb_jmp_cache;
239
+ return;
256
+ if (qatomic_read(&jc->array[h].tb) == tb) {
257
+ qatomic_set(&jc->array[h].tb, NULL);
258
}
259
}
260
261
@@ -XXX,XX +XXX,XX @@ int page_unprotect(target_ulong address, uintptr_t pc)
262
}
263
#endif /* CONFIG_USER_ONLY */
264
265
+/*
266
+ * Called by generic code at e.g. cpu reset after cpu creation,
267
+ * therefore we must be prepared to allocate the jump cache.
268
+ */
269
+void tcg_flush_jmp_cache(CPUState *cpu)
270
+{
271
+ CPUJumpCache *jc = cpu->tb_jmp_cache;
272
+
273
+ if (likely(jc)) {
274
+ for (int i = 0; i < TB_JMP_CACHE_SIZE; i++) {
275
+ qatomic_set(&jc->array[i].tb, NULL);
276
+ }
277
+ } else {
278
+ /* This should happen once during realize, and thus never race. */
279
+ jc = g_new0(CPUJumpCache, 1);
280
+ jc = qatomic_xchg(&cpu->tb_jmp_cache, jc);
281
+ assert(jc == NULL);
282
+ }
240
+ }
283
+}
241
/* patch the branch destination */
284
+
242
uintptr_t addr = tb->jmp_target_addr[n];
285
/* This is a wrapper for common code that can not use CONFIG_SOFTMMU */
243
intptr_t disp = addr - (jmp_rx - 2);
286
void tcg_flush_softmmu_tlb(CPUState *cs)
244
diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
287
{
245
index XXXXXXX..XXXXXXX 100644
288
diff --git a/hw/core/cpu-common.c b/hw/core/cpu-common.c
246
--- a/tcg/tci/tcg-target.c.inc
289
index XXXXXXX..XXXXXXX 100644
247
+++ b/tcg/tci/tcg-target.c.inc
290
--- a/hw/core/cpu-common.c
248
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg)
291
+++ b/hw/core/cpu-common.c
249
292
@@ -XXX,XX +XXX,XX @@ static void cpu_common_reset(DeviceState *dev)
250
static void tcg_out_goto_tb(TCGContext *s, int which)
293
cpu->cflags_next_tb = -1;
251
{
294
252
- qemu_build_assert(!TCG_TARGET_HAS_direct_jump);
295
if (tcg_enabled()) {
253
/* indirect jump method. */
296
- cpu_tb_jmp_cache_clear(cpu);
254
tcg_out_op_p(s, INDEX_op_goto_tb, (void *)get_jmp_target_addr(s, which));
297
-
255
set_jmp_reset_offset(s, which);
298
+ tcg_flush_jmp_cache(cpu);
299
tcg_flush_softmmu_tlb(cpu);
300
}
301
}
302
diff --git a/plugins/core.c b/plugins/core.c
303
index XXXXXXX..XXXXXXX 100644
304
--- a/plugins/core.c
305
+++ b/plugins/core.c
306
@@ -XXX,XX +XXX,XX @@ struct qemu_plugin_ctx *plugin_id_to_ctx_locked(qemu_plugin_id_t id)
307
static void plugin_cpu_update__async(CPUState *cpu, run_on_cpu_data data)
308
{
309
bitmap_copy(cpu->plugin_mask, &data.host_ulong, QEMU_PLUGIN_EV_MAX);
310
- cpu_tb_jmp_cache_clear(cpu);
311
+ tcg_flush_jmp_cache(cpu);
312
}
313
314
static void plugin_cpu_update__locked(gpointer k, gpointer v, gpointer udata)
315
diff --git a/trace/control-target.c b/trace/control-target.c
316
index XXXXXXX..XXXXXXX 100644
317
--- a/trace/control-target.c
318
+++ b/trace/control-target.c
319
@@ -XXX,XX +XXX,XX @@ static void trace_event_synchronize_vcpu_state_dynamic(
320
{
321
bitmap_copy(vcpu->trace_dstate, vcpu->trace_dstate_delayed,
322
CPU_TRACE_DSTATE_MAX_EVENTS);
323
- cpu_tb_jmp_cache_clear(vcpu);
324
+ tcg_flush_jmp_cache(vcpu);
325
}
326
327
void trace_event_set_vcpu_state_dynamic(CPUState *vcpu,
328
--
256
--
329
2.34.1
257
2.34.1
330
258
331
259
1
This function has two users, who use it incompatibly.
1
The old implementation replaces two insns, swapping between
2
In tlb_flush_page_by_mmuidx_async_0, when flushing a
3
single page, we need to flush exactly two pages.
4
In tlb_flush_range_by_mmuidx_async_0, when flushing a
5
range of pages, we need to flush N+1 pages.
6
2
7
This avoids double-flushing of jmp cache pages in a range.
3
    b    <dest>
4
    nop
5
    br    x30
6
and
7
    adrp    x30, <dest>
8
    addi    x30, x30, lo12:<dest>
9
    br    x30
8
10
11
There is a race condition in which a thread could be stopped at
12
the PC of the second insn, and when restarted does not see the
13
complete address computation and branches to nowhere.
14
15
The new implementation replaces only one insn, swapping between
16
17
    b    <dest>
18
    br    tmp
19
and
20
    ldr    tmp, <jmp_addr>
21
    br    tmp
22
23
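For reference, a minimal sketch of the patching discipline the single-insn
scheme relies on, written with the existing QEMU helpers that appear in the
hunks below (the wrapper name itself is only illustrative): because exactly
one naturally aligned 32-bit insn is rewritten, a concurrently executing
thread observes either the old or the new insn, never a torn sequence.

    /*
     * Illustrative sketch only: qatomic_set() and flush_idcache_range()
     * are the existing helpers used by tb_target_set_jmp_target below.
     */
    static void set_jmp_insn(uintptr_t jmp_rx, uintptr_t jmp_rw,
                             uint32_t new_insn)
    {
        /* One aligned 32-bit store; old or new insn, nothing in between. */
        qatomic_set((uint32_t *)jmp_rw, new_insn);
        /* Keep the executable (RX) alias coherent with the RW write. */
        flush_idcache_range(jmp_rx, jmp_rw, 4);
    }
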
Reported-by: hev <r@hev.cc>
9
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
24
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
10
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
25
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
11
---
26
---
12
accel/tcg/cputlb.c | 25 ++++++++++++++-----------
27
tcg/aarch64/tcg-target.h | 2 +-
13
1 file changed, 14 insertions(+), 11 deletions(-)
28
tcg/aarch64/tcg-target.c.inc | 66 +++++++++++++++---------------------
29
2 files changed, 29 insertions(+), 39 deletions(-)
14
30
15
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
31
diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
16
index XXXXXXX..XXXXXXX 100644
32
index XXXXXXX..XXXXXXX 100644
17
--- a/accel/tcg/cputlb.c
33
--- a/tcg/aarch64/tcg-target.h
18
+++ b/accel/tcg/cputlb.c
34
+++ b/tcg/aarch64/tcg-target.h
19
@@ -XXX,XX +XXX,XX @@ static void tb_jmp_cache_clear_page(CPUState *cpu, target_ulong page_addr)
35
@@ -XXX,XX +XXX,XX @@
20
}
36
37
#define TCG_TARGET_INSN_UNIT_SIZE 4
38
#define TCG_TARGET_TLB_DISPLACEMENT_BITS 24
39
-#define MAX_CODE_GEN_BUFFER_SIZE (2 * GiB)
40
+#define MAX_CODE_GEN_BUFFER_SIZE ((size_t)-1)
41
42
typedef enum {
43
TCG_REG_X0, TCG_REG_X1, TCG_REG_X2, TCG_REG_X3,
44
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
45
index XXXXXXX..XXXXXXX 100644
46
--- a/tcg/aarch64/tcg-target.c.inc
47
+++ b/tcg/aarch64/tcg-target.c.inc
48
@@ -XXX,XX +XXX,XX @@ static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target,
49
tcg_out_call_int(s, target);
21
}
50
}
22
51
23
-static void tb_flush_jmp_cache(CPUState *cpu, target_ulong addr)
52
-void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
53
- uintptr_t jmp_rx, uintptr_t jmp_rw)
24
-{
54
-{
25
- /* Discard jump cache entries for any tb which might potentially
55
- uintptr_t addr = tb->jmp_target_addr[n];
26
- overlap the flushed page. */
56
- tcg_insn_unit i1, i2;
27
- tb_jmp_cache_clear_page(cpu, addr - TARGET_PAGE_SIZE);
57
- TCGType rt = TCG_TYPE_I64;
28
- tb_jmp_cache_clear_page(cpu, addr);
58
- TCGReg rd = TCG_REG_TMP;
59
- uint64_t pair;
60
-
61
- ptrdiff_t offset = addr - jmp_rx;
62
-
63
- if (offset == sextract64(offset, 0, 26)) {
64
- i1 = I3206_B | ((offset >> 2) & 0x3ffffff);
65
- i2 = NOP;
66
- } else {
67
- offset = (addr >> 12) - (jmp_rx >> 12);
68
-
69
- /* patch ADRP */
70
- i1 = I3406_ADRP | (offset & 3) << 29 | (offset & 0x1ffffc) << (5 - 2) | rd;
71
- /* patch ADDI */
72
- i2 = I3401_ADDI | rt << 31 | (addr & 0xfff) << 10 | rd << 5 | rd;
73
- }
74
- pair = (uint64_t)i2 << 32 | i1;
75
- qatomic_set((uint64_t *)jmp_rw, pair);
76
- flush_idcache_range(jmp_rx, jmp_rw, 8);
29
-}
77
-}
30
-
78
-
31
/**
79
static inline void tcg_out_goto_label(TCGContext *s, TCGLabel *l)
32
* tlb_mmu_resize_locked() - perform TLB resize bookkeeping; resize if necessary
80
{
33
* @desc: The CPUTLBDesc portion of the TLB
81
if (!l->has_value) {
34
@@ -XXX,XX +XXX,XX @@ static void tlb_flush_page_by_mmuidx_async_0(CPUState *cpu,
82
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
35
}
83
static void tcg_out_goto_tb(TCGContext *s, int which)
36
qemu_spin_unlock(&env_tlb(env)->c.lock);
84
{
37
85
/*
38
- tb_flush_jmp_cache(cpu, addr);
86
- * Ensure that ADRP+ADD are 8-byte aligned so that an atomic
39
+ /*
87
- * write can be used to patch the target address.
40
+ * Discard jump cache entries for any tb which might potentially
88
+ * Direct branch, or indirect address load, will be patched
41
+ * overlap the flushed page, which includes the previous.
89
+ * by tb_target_set_jmp_target. Assert indirect load offset
42
+ */
90
+ * in range early, regardless of direct branch distance.
43
+ tb_jmp_cache_clear_page(cpu, addr - TARGET_PAGE_SIZE);
91
*/
44
+ tb_jmp_cache_clear_page(cpu, addr);
92
- if ((uintptr_t)s->code_ptr & 7) {
93
- tcg_out32(s, NOP);
94
- }
95
+ intptr_t i_off = tcg_pcrel_diff(s, (void *)get_jmp_target_addr(s, which));
96
+ tcg_debug_assert(i_off == sextract64(i_off, 0, 21));
97
+
98
set_jmp_insn_offset(s, which);
99
- /*
100
- * actual branch destination will be patched by
101
- * tb_target_set_jmp_target later
102
- */
103
- tcg_out_insn(s, 3406, ADRP, TCG_REG_TMP, 0);
104
- tcg_out_insn(s, 3401, ADDI, TCG_TYPE_I64, TCG_REG_TMP, TCG_REG_TMP, 0);
105
+ tcg_out32(s, I3206_B);
106
tcg_out_insn(s, 3207, BR, TCG_REG_TMP);
107
set_jmp_reset_offset(s, which);
45
}
108
}
46
109
47
/**
110
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
48
@@ -XXX,XX +XXX,XX @@ static void tlb_flush_range_by_mmuidx_async_0(CPUState *cpu,
111
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
49
return;
112
+{
50
}
113
+ uintptr_t d_addr = tb->jmp_target_addr[n];
51
114
+ ptrdiff_t d_offset = d_addr - jmp_rx;
52
- for (target_ulong i = 0; i < d.len; i += TARGET_PAGE_SIZE) {
115
+ tcg_insn_unit insn;
53
- tb_flush_jmp_cache(cpu, d.addr + i);
116
+
54
+ /*
117
+ /* Either directly branch, or indirect branch load. */
55
+ * Discard jump cache entries for any tb which might potentially
118
+ if (d_offset == sextract64(d_offset, 0, 28)) {
56
+ * overlap the flushed pages, which includes the previous.
119
+ insn = deposit32(I3206_B, 0, 26, d_offset >> 2);
57
+ */
120
+ } else {
58
+ d.addr -= TARGET_PAGE_SIZE;
121
+ uintptr_t i_addr = (uintptr_t)&tb->jmp_target_addr[n];
59
+ for (target_ulong i = 0, n = d.len / TARGET_PAGE_SIZE + 1; i < n; i++) {
122
+ ptrdiff_t i_offset = i_addr - jmp_rx;
60
+ tb_jmp_cache_clear_page(cpu, d.addr);
123
+
61
+ d.addr += TARGET_PAGE_SIZE;
124
+ /* Note that we asserted this in range in tcg_out_goto_tb. */
62
}
125
+ insn = deposit32(I3305_LDR | TCG_REG_TMP, 0, 5, i_offset >> 2);
63
}
126
+ }
64
127
+ qatomic_set((uint32_t *)jmp_rw, insn);
128
+ flush_idcache_range(jmp_rx, jmp_rw, 4);
129
+}
130
+
131
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
132
const TCGArg args[TCG_MAX_OP_ARGS],
133
const int const_args[TCG_MAX_OP_ARGS])
65
--
134
--
66
2.34.1
135
2.34.1
67
136
68
137
1
From: Leandro Lupori <leandro.lupori@eldorado.org.br>
1
The old ppc64 implementation replaces 2 or 4 insns, which leaves a race
2
2
condition in which a thread could be stopped at a PC in the middle of
3
PowerPC64 processors handle direct branches better than indirect
3
the sequence, and when restarted does not see the complete address
4
ones, resulting in fewer stalled cycles and branch misses.
4
computation and branches to nowhere.
5
5
6
However, PPC's tb_target_set_jmp_target() was only using direct
6
The new implementation replaces only one insn, swapping between
7
branches for 16-bit jumps, while PowerPC64's unconditional branch
7
8
instructions are able to handle displacements of up to 26 bits.
8
    b <dest>
9
To take advantage of this, now jumps whose displacements fit in
9
and
10
between 17 and 26 bits are also converted to direct branches.
10
    mtctr    r31
11
11
12
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
12
falling through to a general-case indirect branch.
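To make the displacement limit concrete: the PowerPC I-form branch carries a
24-bit LI field that is implicitly shifted left by 2, so an unconditional b
reaches a signed 26-bit byte displacement, roughly +/-32 MiB. A hedged sketch
of the range test, in the spirit of the existing in_range_b()/sextract64()
helpers used below (the helper name here is illustrative):

    /* Sketch: can a single I-form branch reach this byte displacement? */
    static inline bool direct_branch_reaches(intptr_t disp)
    {
        /* word aligned, and within the signed 26-bit encodable range */
        return (disp & 3) == 0 && disp == sextract64(disp, 0, 26);
    }
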
13
Signed-off-by: Leandro Lupori <leandro.lupori@eldorado.org.br>
13
14
[rth: Expanded some commentary.]
14
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
15
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
15
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
16
---
16
---
17
tcg/ppc/tcg-target.c.inc | 119 +++++++++++++++++++++++++++++----------
17
tcg/ppc/tcg-target.h | 3 +-
18
1 file changed, 88 insertions(+), 31 deletions(-)
18
tcg/ppc/tcg-target.c.inc | 158 +++++++++++----------------------------
19
19
2 files changed, 44 insertions(+), 117 deletions(-)
20
21
diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
22
index XXXXXXX..XXXXXXX 100644
23
--- a/tcg/ppc/tcg-target.h
24
+++ b/tcg/ppc/tcg-target.h
25
@@ -XXX,XX +XXX,XX @@
26
27
#ifdef _ARCH_PPC64
28
# define TCG_TARGET_REG_BITS 64
29
-# define MAX_CODE_GEN_BUFFER_SIZE (2 * GiB)
30
#else
31
# define TCG_TARGET_REG_BITS 32
32
-# define MAX_CODE_GEN_BUFFER_SIZE (32 * MiB)
33
#endif
34
+#define MAX_CODE_GEN_BUFFER_SIZE ((size_t)-1)
35
36
#define TCG_TARGET_NB_REGS 64
37
#define TCG_TARGET_INSN_UNIT_SIZE 4
20
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
38
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
21
index XXXXXXX..XXXXXXX 100644
39
index XXXXXXX..XXXXXXX 100644
22
--- a/tcg/ppc/tcg-target.c.inc
40
--- a/tcg/ppc/tcg-target.c.inc
23
+++ b/tcg/ppc/tcg-target.c.inc
41
+++ b/tcg/ppc/tcg-target.c.inc
24
@@ -XXX,XX +XXX,XX @@ static void tcg_out_mb(TCGContext *s, TCGArg a0)
42
@@ -XXX,XX +XXX,XX @@ static void tcg_out_mb(TCGContext *s, TCGArg a0)
25
tcg_out32(s, insn);
43
tcg_out32(s, insn);
26
}
44
}
27
45
28
+static inline uint64_t make_pair(tcg_insn_unit i1, tcg_insn_unit i2)
46
-static inline uint64_t make_pair(tcg_insn_unit i1, tcg_insn_unit i2)
47
-{
48
- if (HOST_BIG_ENDIAN) {
49
- return (uint64_t)i1 << 32 | i2;
50
- }
51
- return (uint64_t)i2 << 32 | i1;
52
-}
53
-
54
-static inline void ppc64_replace2(uintptr_t rx, uintptr_t rw,
55
- tcg_insn_unit i0, tcg_insn_unit i1)
56
-{
57
-#if TCG_TARGET_REG_BITS == 64
58
- qatomic_set((uint64_t *)rw, make_pair(i0, i1));
59
- flush_idcache_range(rx, rw, 8);
60
-#else
61
- qemu_build_not_reached();
62
-#endif
63
-}
64
-
65
-static inline void ppc64_replace4(uintptr_t rx, uintptr_t rw,
66
- tcg_insn_unit i0, tcg_insn_unit i1,
67
- tcg_insn_unit i2, tcg_insn_unit i3)
68
-{
69
- uint64_t p[2];
70
-
71
- p[!HOST_BIG_ENDIAN] = make_pair(i0, i1);
72
- p[HOST_BIG_ENDIAN] = make_pair(i2, i3);
73
-
74
- /*
75
- * There's no convenient way to get the compiler to allocate a pair
76
- * of registers at an even index, so copy into r6/r7 and clobber.
77
- */
78
- asm("mr %%r6, %1\n\t"
79
- "mr %%r7, %2\n\t"
80
- "stq %%r6, %0"
81
- : "=Q"(*(__int128 *)rw) : "r"(p[0]), "r"(p[1]) : "r6", "r7");
82
- flush_idcache_range(rx, rw, 16);
83
-}
84
-
85
-void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
86
- uintptr_t jmp_rx, uintptr_t jmp_rw)
87
-{
88
- tcg_insn_unit i0, i1, i2, i3;
89
- uintptr_t addr = tb->jmp_target_addr[n];
90
- intptr_t tb_diff = addr - (uintptr_t)tb->tc.ptr;
91
- intptr_t br_diff = addr - (jmp_rx + 4);
92
- intptr_t lo, hi;
93
-
94
- if (TCG_TARGET_REG_BITS == 32) {
95
- intptr_t diff = addr - jmp_rx;
96
- tcg_debug_assert(in_range_b(diff));
97
- qatomic_set((uint32_t *)jmp_rw, B | (diff & 0x3fffffc));
98
- flush_idcache_range(jmp_rx, jmp_rw, 4);
99
- return;
100
- }
101
-
102
- /*
103
- * For 16-bit displacements, we can use a single add + branch.
104
- * This happens quite often.
105
- */
106
- if (tb_diff == (int16_t)tb_diff) {
107
- i0 = ADDI | TAI(TCG_REG_TB, TCG_REG_TB, tb_diff);
108
- i1 = B | (br_diff & 0x3fffffc);
109
- ppc64_replace2(jmp_rx, jmp_rw, i0, i1);
110
- return;
111
- }
112
-
113
- lo = (int16_t)tb_diff;
114
- hi = (int32_t)(tb_diff - lo);
115
- assert(tb_diff == hi + lo);
116
- i0 = ADDIS | TAI(TCG_REG_TB, TCG_REG_TB, hi >> 16);
117
- i1 = ADDI | TAI(TCG_REG_TB, TCG_REG_TB, lo);
118
-
119
- /*
120
- * Without stq from 2.07, we can only update two insns,
121
- * and those must be the ones that load the target address.
122
- */
123
- if (!have_isa_2_07) {
124
- ppc64_replace2(jmp_rx, jmp_rw, i0, i1);
125
- return;
126
- }
127
-
128
- /*
129
- * For 26-bit displacements, we can use a direct branch.
130
- * Otherwise we still need the indirect branch, which we
131
- * must restore after a potential direct branch write.
132
- */
133
- br_diff -= 4;
134
- if (in_range_b(br_diff)) {
135
- i2 = B | (br_diff & 0x3fffffc);
136
- i3 = NOP;
137
- } else {
138
- i2 = MTSPR | RS(TCG_REG_TB) | CTR;
139
- i3 = BCCTR | BO_ALWAYS;
140
- }
141
- ppc64_replace4(jmp_rx, jmp_rw, i0, i1, i2, i3);
142
-}
143
-
144
static void tcg_out_call_int(TCGContext *s, int lk,
145
const tcg_insn_unit *target)
146
{
147
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg)
148
149
static void tcg_out_goto_tb(TCGContext *s, int which)
150
{
151
- /* Direct jump. */
152
- if (TCG_TARGET_REG_BITS == 64) {
153
- /* Ensure the next insns are 8 or 16-byte aligned. */
154
- while ((uintptr_t)s->code_ptr & (have_isa_2_07 ? 15 : 7)) {
155
- tcg_out32(s, NOP);
156
- }
157
+ uintptr_t ptr = get_jmp_target_addr(s, which);
158
+
159
+ if (USE_REG_TB) {
160
+ ptrdiff_t offset = tcg_tbrel_diff(s, (void *)ptr);
161
+ tcg_out_mem_long(s, LD, LDX, TCG_REG_TB, TCG_REG_TB, offset);
162
+
163
+ /* Direct branch will be patched by tb_target_set_jmp_target. */
164
set_jmp_insn_offset(s, which);
165
- tcg_out32(s, ADDIS | TAI(TCG_REG_TB, TCG_REG_TB, 0));
166
- tcg_out32(s, ADDI | TAI(TCG_REG_TB, TCG_REG_TB, 0));
167
tcg_out32(s, MTSPR | RS(TCG_REG_TB) | CTR);
168
+
169
+ /* When branch is out of range, fall through to indirect. */
170
+ tcg_out32(s, BCCTR | BO_ALWAYS);
171
+
172
+ /* For the unlinked case, need to reset TCG_REG_TB. */
173
+ set_jmp_reset_offset(s, which);
174
+ tcg_out_mem_long(s, ADDI, ADD, TCG_REG_TB, TCG_REG_TB,
175
+ -tcg_current_code_size(s));
176
+ } else {
177
+ /* Direct branch will be patched by tb_target_set_jmp_target. */
178
+ set_jmp_insn_offset(s, which);
179
+ tcg_out32(s, NOP);
180
+
181
+ /* When branch is out of range, fall through to indirect. */
182
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP1, ptr - (int16_t)ptr);
183
+ tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1, (int16_t)ptr);
184
+ tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
185
tcg_out32(s, BCCTR | BO_ALWAYS);
186
set_jmp_reset_offset(s, which);
187
- if (USE_REG_TB) {
188
- /* For the unlinked case, need to reset TCG_REG_TB. */
189
- tcg_out_mem_long(s, ADDI, ADD, TCG_REG_TB, TCG_REG_TB,
190
- -tcg_current_code_size(s));
191
- }
192
- } else {
193
- set_jmp_insn_offset(s, which);
194
- tcg_out32(s, B);
195
- set_jmp_reset_offset(s, which);
196
}
197
}
198
199
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
200
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
29
+{
201
+{
30
+ if (HOST_BIG_ENDIAN) {
202
+ uintptr_t addr = tb->jmp_target_addr[n];
31
+ return (uint64_t)i1 << 32 | i2;
203
+ intptr_t diff = addr - jmp_rx;
204
+ tcg_insn_unit insn;
205
+
206
+ if (in_range_b(diff)) {
207
+ insn = B | (diff & 0x3fffffc);
208
+ } else if (USE_REG_TB) {
209
+ insn = MTSPR | RS(TCG_REG_TB) | CTR;
210
+ } else {
211
+ insn = NOP;
32
+ }
212
+ }
33
+ return (uint64_t)i2 << 32 | i1;
213
+
214
+ qatomic_set((uint32_t *)jmp_rw, insn);
215
+ flush_idcache_range(jmp_rx, jmp_rw, 4);
34
+}
216
+}
35
+
217
+
36
+static inline void ppc64_replace2(uintptr_t rx, uintptr_t rw,
218
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
37
+ tcg_insn_unit i0, tcg_insn_unit i1)
219
const TCGArg args[TCG_MAX_OP_ARGS],
38
+{
220
const int const_args[TCG_MAX_OP_ARGS])
39
+#if TCG_TARGET_REG_BITS == 64
40
+ qatomic_set((uint64_t *)rw, make_pair(i0, i1));
41
+ flush_idcache_range(rx, rw, 8);
42
+#else
43
+ qemu_build_not_reached();
44
+#endif
45
+}
46
+
47
+static inline void ppc64_replace4(uintptr_t rx, uintptr_t rw,
48
+ tcg_insn_unit i0, tcg_insn_unit i1,
49
+ tcg_insn_unit i2, tcg_insn_unit i3)
50
+{
51
+ uint64_t p[2];
52
+
53
+ p[!HOST_BIG_ENDIAN] = make_pair(i0, i1);
54
+ p[HOST_BIG_ENDIAN] = make_pair(i2, i3);
55
+
56
+ /*
57
+ * There's no convenient way to get the compiler to allocate a pair
58
+ * of registers at an even index, so copy into r6/r7 and clobber.
59
+ */
60
+ asm("mr %%r6, %1\n\t"
61
+ "mr %%r7, %2\n\t"
62
+ "stq %%r6, %0"
63
+ : "=Q"(*(__int128 *)rw) : "r"(p[0]), "r"(p[1]) : "r6", "r7");
64
+ flush_idcache_range(rx, rw, 16);
65
+}
66
+
67
void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
68
uintptr_t jmp_rw, uintptr_t addr)
69
{
70
- if (TCG_TARGET_REG_BITS == 64) {
71
- tcg_insn_unit i1, i2;
72
- intptr_t tb_diff = addr - tc_ptr;
73
- intptr_t br_diff = addr - (jmp_rx + 4);
74
- uint64_t pair;
75
+ tcg_insn_unit i0, i1, i2, i3;
76
+ intptr_t tb_diff = addr - tc_ptr;
77
+ intptr_t br_diff = addr - (jmp_rx + 4);
78
+ intptr_t lo, hi;
79
80
- /* This does not exercise the range of the branch, but we do
81
- still need to be able to load the new value of TCG_REG_TB.
82
- But this does still happen quite often. */
83
- if (tb_diff == (int16_t)tb_diff) {
84
- i1 = ADDI | TAI(TCG_REG_TB, TCG_REG_TB, tb_diff);
85
- i2 = B | (br_diff & 0x3fffffc);
86
- } else {
87
- intptr_t lo = (int16_t)tb_diff;
88
- intptr_t hi = (int32_t)(tb_diff - lo);
89
- assert(tb_diff == hi + lo);
90
- i1 = ADDIS | TAI(TCG_REG_TB, TCG_REG_TB, hi >> 16);
91
- i2 = ADDI | TAI(TCG_REG_TB, TCG_REG_TB, lo);
92
- }
93
-#if HOST_BIG_ENDIAN
94
- pair = (uint64_t)i1 << 32 | i2;
95
-#else
96
- pair = (uint64_t)i2 << 32 | i1;
97
-#endif
98
-
99
- /* As per the enclosing if, this is ppc64. Avoid the _Static_assert
100
- within qatomic_set that would fail to build a ppc32 host. */
101
- qatomic_set__nocheck((uint64_t *)jmp_rw, pair);
102
- flush_idcache_range(jmp_rx, jmp_rw, 8);
103
- } else {
104
+ if (TCG_TARGET_REG_BITS == 32) {
105
intptr_t diff = addr - jmp_rx;
106
tcg_debug_assert(in_range_b(diff));
107
qatomic_set((uint32_t *)jmp_rw, B | (diff & 0x3fffffc));
108
flush_idcache_range(jmp_rx, jmp_rw, 4);
109
+ return;
110
}
111
+
112
+ /*
113
+ * For 16-bit displacements, we can use a single add + branch.
114
+ * This happens quite often.
115
+ */
116
+ if (tb_diff == (int16_t)tb_diff) {
117
+ i0 = ADDI | TAI(TCG_REG_TB, TCG_REG_TB, tb_diff);
118
+ i1 = B | (br_diff & 0x3fffffc);
119
+ ppc64_replace2(jmp_rx, jmp_rw, i0, i1);
120
+ return;
121
+ }
122
+
123
+ lo = (int16_t)tb_diff;
124
+ hi = (int32_t)(tb_diff - lo);
125
+ assert(tb_diff == hi + lo);
126
+ i0 = ADDIS | TAI(TCG_REG_TB, TCG_REG_TB, hi >> 16);
127
+ i1 = ADDI | TAI(TCG_REG_TB, TCG_REG_TB, lo);
128
+
129
+ /*
130
+ * Without stq from 2.07, we can only update two insns,
131
+ * and those must be the ones that load the target address.
132
+ */
133
+ if (!have_isa_2_07) {
134
+ ppc64_replace2(jmp_rx, jmp_rw, i0, i1);
135
+ return;
136
+ }
137
+
138
+ /*
139
+ * For 26-bit displacements, we can use a direct branch.
140
+ * Otherwise we still need the indirect branch, which we
141
+ * must restore after a potential direct branch write.
142
+ */
143
+ br_diff -= 4;
144
+ if (in_range_b(br_diff)) {
145
+ i2 = B | (br_diff & 0x3fffffc);
146
+ i3 = NOP;
147
+ } else {
148
+ i2 = MTSPR | RS(TCG_REG_TB) | CTR;
149
+ i3 = BCCTR | BO_ALWAYS;
150
+ }
151
+ ppc64_replace4(jmp_rx, jmp_rw, i0, i1, i2, i3);
152
}
153
154
static void tcg_out_call_int(TCGContext *s, int lk,
155
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
156
if (s->tb_jmp_insn_offset) {
157
/* Direct jump. */
158
if (TCG_TARGET_REG_BITS == 64) {
159
- /* Ensure the next insns are 8-byte aligned. */
160
- if ((uintptr_t)s->code_ptr & 7) {
161
+ /* Ensure the next insns are 8 or 16-byte aligned. */
162
+ while ((uintptr_t)s->code_ptr & (have_isa_2_07 ? 15 : 7)) {
163
tcg_out32(s, NOP);
164
}
165
s->tb_jmp_insn_offset[args[0]] = tcg_current_code_size(s);
166
--
221
--
167
2.34.1
222
2.34.1
223
224
1
This bitmap is created and discarded immediately.
1
This is always true for sparc64, so this is dead since 3a5f6805c7ca.
2
We gain nothing by its existence.
3
2
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
3
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
4
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
Message-Id: <20220822232338.1727934-2-richard.henderson@linaro.org>
7
---
6
---
8
accel/tcg/translate-all.c | 78 ++-------------------------------------
7
tcg/sparc64/tcg-target.c.inc | 62 ++++++++++++------------------------
9
1 file changed, 4 insertions(+), 74 deletions(-)
8
1 file changed, 21 insertions(+), 41 deletions(-)
10
9
11
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
10
diff --git a/tcg/sparc64/tcg-target.c.inc b/tcg/sparc64/tcg-target.c.inc
12
index XXXXXXX..XXXXXXX 100644
11
index XXXXXXX..XXXXXXX 100644
13
--- a/accel/tcg/translate-all.c
12
--- a/tcg/sparc64/tcg-target.c.inc
14
+++ b/accel/tcg/translate-all.c
13
+++ b/tcg/sparc64/tcg-target.c.inc
15
@@ -XXX,XX +XXX,XX @@
14
@@ -XXX,XX +XXX,XX @@ static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
16
#define assert_memory_lock() tcg_debug_assert(have_mmap_lock())
17
#endif
15
#endif
18
16
19
-#define SMC_BITMAP_USE_THRESHOLD 10
17
#define TCG_REG_TB TCG_REG_I1
20
-
18
-#define USE_REG_TB (sizeof(void *) > 4)
21
typedef struct PageDesc {
19
22
/* list of TBs intersecting this ram page */
20
static const int tcg_target_reg_alloc_order[] = {
23
uintptr_t first_tb;
21
TCG_REG_L0,
24
-#ifdef CONFIG_SOFTMMU
22
@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
25
- /* in order to optimize self modifying code, we count the number
23
}
26
- of lookups we do to a given page to use a bitmap */
24
27
- unsigned long *code_bitmap;
25
/* A 13-bit constant relative to the TB. */
28
- unsigned int code_write_count;
26
- if (!in_prologue && USE_REG_TB) {
29
-#else
27
+ if (!in_prologue) {
30
+#ifdef CONFIG_USER_ONLY
28
test = tcg_tbrel_diff(s, (void *)arg);
31
unsigned long flags;
29
if (check_fit_ptr(test, 13)) {
32
void *target_data;
30
tcg_out_arithi(s, ret, TCG_REG_TB, test, ARITH_ADD);
31
@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
32
}
33
34
/* Use the constant pool, if possible. */
35
- if (!in_prologue && USE_REG_TB) {
36
+ if (!in_prologue) {
37
new_pool_label(s, arg, R_SPARC_13, s->code_ptr,
38
tcg_tbrel_diff(s, NULL));
39
tcg_out32(s, LDX | INSN_RD(ret) | INSN_RS1(TCG_REG_TB));
40
@@ -XXX,XX +XXX,XX @@ static void tcg_target_qemu_prologue(TCGContext *s)
33
#endif
41
#endif
34
-#ifndef CONFIG_USER_ONLY
42
35
+#ifdef CONFIG_SOFTMMU
43
/* We choose TCG_REG_TB such that no move is required. */
36
QemuSpin lock;
44
- if (USE_REG_TB) {
37
#endif
45
- QEMU_BUILD_BUG_ON(TCG_REG_TB != TCG_REG_I1);
38
} PageDesc;
46
- tcg_regset_set_reg(s->reserved_regs, TCG_REG_TB);
39
@@ -XXX,XX +XXX,XX @@ void tb_htable_init(void)
47
- }
40
qht_init(&tb_ctx.htable, tb_cmp, CODE_GEN_HTABLE_SIZE, mode);
48
+ QEMU_BUILD_BUG_ON(TCG_REG_TB != TCG_REG_I1);
41
}
49
+ tcg_regset_set_reg(s->reserved_regs, TCG_REG_TB);
42
50
43
-/* call with @p->lock held */
51
tcg_out_arithi(s, TCG_REG_G0, TCG_REG_I1, 0, JMPL);
44
-static inline void invalidate_page_bitmap(PageDesc *p)
52
/* delay slot */
45
-{
53
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
46
- assert_page_locked(p);
54
tcg_out_arithi(s, TCG_REG_G0, TCG_REG_I7, 8, RETURN);
47
-#ifdef CONFIG_SOFTMMU
55
tcg_out_movi_imm13(s, TCG_REG_O0, a0);
48
- g_free(p->code_bitmap);
56
return;
49
- p->code_bitmap = NULL;
57
- } else if (USE_REG_TB) {
50
- p->code_write_count = 0;
58
+ } else {
51
-#endif
59
intptr_t tb_diff = tcg_tbrel_diff(s, (void *)a0);
52
-}
60
if (check_fit_ptr(tb_diff, 13)) {
53
-
61
tcg_out_arithi(s, TCG_REG_G0, TCG_REG_I7, 8, RETURN);
54
/* Set to NULL all the 'first_tb' fields in all PageDescs. */
62
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
55
static void page_flush_tb_1(int level, void **lp)
63
64
static void tcg_out_goto_tb(TCGContext *s, int which)
56
{
65
{
57
@@ -XXX,XX +XXX,XX @@ static void page_flush_tb_1(int level, void **lp)
66
+ int c;
58
for (i = 0; i < V_L2_SIZE; ++i) {
67
+
59
page_lock(&pd[i]);
68
/* Direct jump. */
60
pd[i].first_tb = (uintptr_t)NULL;
69
- if (USE_REG_TB) {
61
- invalidate_page_bitmap(pd + i);
70
- /* make sure the patch is 8-byte aligned. */
62
page_unlock(&pd[i]);
71
- if ((intptr_t)s->code_ptr & 4) {
63
}
72
- tcg_out_nop(s);
64
} else {
73
- }
65
@@ -XXX,XX +XXX,XX @@ static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
74
- set_jmp_insn_offset(s, which);
66
if (rm_from_page_list) {
75
- tcg_out_sethi(s, TCG_REG_T1, 0);
67
p = page_find(tb->page_addr[0] >> TARGET_PAGE_BITS);
76
- tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, 0, ARITH_OR);
68
tb_page_remove(p, tb);
77
- tcg_out_arith(s, TCG_REG_G0, TCG_REG_TB, TCG_REG_T1, JMPL);
69
- invalidate_page_bitmap(p);
78
- tcg_out_arith(s, TCG_REG_TB, TCG_REG_TB, TCG_REG_T1, ARITH_ADD);
70
if (tb->page_addr[1] != -1) {
79
- } else {
71
p = page_find(tb->page_addr[1] >> TARGET_PAGE_BITS);
80
- set_jmp_insn_offset(s, which);
72
tb_page_remove(p, tb);
81
- tcg_out32(s, CALL);
73
- invalidate_page_bitmap(p);
82
+ /* make sure the patch is 8-byte aligned. */
74
}
83
+ if ((intptr_t)s->code_ptr & 4) {
84
tcg_out_nop(s);
75
}
85
}
76
86
+ set_jmp_insn_offset(s, which);
77
@@ -XXX,XX +XXX,XX @@ void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr)
87
+ tcg_out_sethi(s, TCG_REG_T1, 0);
88
+ tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, 0, ARITH_OR);
89
+ tcg_out_arith(s, TCG_REG_G0, TCG_REG_TB, TCG_REG_T1, JMPL);
90
+ tcg_out_arith(s, TCG_REG_TB, TCG_REG_TB, TCG_REG_T1, ARITH_ADD);
91
set_jmp_reset_offset(s, which);
92
93
/*
94
* For the unlinked path of goto_tb, we need to reset TCG_REG_TB
95
* to the beginning of this TB.
96
*/
97
- if (USE_REG_TB) {
98
- int c = -tcg_current_code_size(s);
99
- if (check_fit_i32(c, 13)) {
100
- tcg_out_arithi(s, TCG_REG_TB, TCG_REG_TB, c, ARITH_ADD);
101
- } else {
102
- tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_T1, c);
103
- tcg_out_arith(s, TCG_REG_TB, TCG_REG_TB, TCG_REG_T1, ARITH_ADD);
104
- }
105
+ c = -tcg_current_code_size(s);
106
+ if (check_fit_i32(c, 13)) {
107
+ tcg_out_arithi(s, TCG_REG_TB, TCG_REG_TB, c, ARITH_ADD);
108
+ } else {
109
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_T1, c);
110
+ tcg_out_arith(s, TCG_REG_TB, TCG_REG_TB, TCG_REG_T1, ARITH_ADD);
78
}
111
}
79
}
112
}
80
113
81
-#ifdef CONFIG_SOFTMMU
114
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
82
-/* call with @p->lock held */
115
switch (opc) {
83
-static void build_page_bitmap(PageDesc *p)
116
case INDEX_op_goto_ptr:
84
-{
117
tcg_out_arithi(s, TCG_REG_G0, a0, 0, JMPL);
85
- int n, tb_start, tb_end;
118
- if (USE_REG_TB) {
86
- TranslationBlock *tb;
119
- tcg_out_mov_delay(s, TCG_REG_TB, a0);
120
- } else {
121
- tcg_out_nop(s);
122
- }
123
+ tcg_out_mov_delay(s, TCG_REG_TB, a0);
124
break;
125
case INDEX_op_br:
126
tcg_out_bpcc(s, COND_A, BPCC_PT, arg_label(a0));
127
@@ -XXX,XX +XXX,XX @@ void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
128
tcg_debug_assert(tb_disp == (int32_t)tb_disp);
129
tcg_debug_assert(br_disp == (int32_t)br_disp);
130
131
- if (!USE_REG_TB) {
132
- qatomic_set((uint32_t *)jmp_rw,
133
-         deposit32(CALL, 0, 30, br_disp >> 2));
134
- flush_idcache_range(jmp_rx, jmp_rw, 4);
135
- return;
136
- }
87
-
137
-
88
- assert_page_locked(p);
138
/* This does not exercise the range of the branch, but we do
89
- p->code_bitmap = bitmap_new(TARGET_PAGE_SIZE);
139
still need to be able to load the new value of TCG_REG_TB.
90
-
140
But this does still happen quite often. */
91
- PAGE_FOR_EACH_TB(p, tb, n) {
92
- /* NOTE: this is subtle as a TB may span two physical pages */
93
- if (n == 0) {
94
- /* NOTE: tb_end may be after the end of the page, but
95
- it is not a problem */
96
- tb_start = tb->pc & ~TARGET_PAGE_MASK;
97
- tb_end = tb_start + tb->size;
98
- if (tb_end > TARGET_PAGE_SIZE) {
99
- tb_end = TARGET_PAGE_SIZE;
100
- }
101
- } else {
102
- tb_start = 0;
103
- tb_end = ((tb->pc + tb->size) & ~TARGET_PAGE_MASK);
104
- }
105
- bitmap_set(p->code_bitmap, tb_start, tb_end - tb_start);
106
- }
107
-}
108
-#endif
109
-
110
/* add the tb in the target page and protect it if necessary
111
*
112
* Called with mmap_lock held for user-mode emulation.
113
@@ -XXX,XX +XXX,XX @@ static inline void tb_page_add(PageDesc *p, TranslationBlock *tb,
114
page_already_protected = p->first_tb != (uintptr_t)NULL;
115
#endif
116
p->first_tb = (uintptr_t)tb | n;
117
- invalidate_page_bitmap(p);
118
119
#if defined(CONFIG_USER_ONLY)
120
/* translator_loop() must have made all TB pages non-writable */
121
@@ -XXX,XX +XXX,XX @@ tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
122
/* remove TB from the page(s) if we couldn't insert it */
123
if (unlikely(existing_tb)) {
124
tb_page_remove(p, tb);
125
- invalidate_page_bitmap(p);
126
if (p2) {
127
tb_page_remove(p2, tb);
128
- invalidate_page_bitmap(p2);
129
}
130
tb = existing_tb;
131
}
132
@@ -XXX,XX +XXX,XX @@ tb_invalidate_phys_page_range__locked(struct page_collection *pages,
133
#if !defined(CONFIG_USER_ONLY)
134
/* if no code remaining, no need to continue to use slow writes */
135
if (!p->first_tb) {
136
- invalidate_page_bitmap(p);
137
tlb_unprotect_code(start);
138
}
139
#endif
140
@@ -XXX,XX +XXX,XX @@ void tb_invalidate_phys_page_fast(struct page_collection *pages,
141
}
142
143
assert_page_locked(p);
144
- if (!p->code_bitmap &&
145
- ++p->code_write_count >= SMC_BITMAP_USE_THRESHOLD) {
146
- build_page_bitmap(p);
147
- }
148
- if (p->code_bitmap) {
149
- unsigned int nr;
150
- unsigned long b;
151
-
152
- nr = start & ~TARGET_PAGE_MASK;
153
- b = p->code_bitmap[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG - 1));
154
- if (b & ((1 << len) - 1)) {
155
- goto do_invalidate;
156
- }
157
- } else {
158
- do_invalidate:
159
- tb_invalidate_phys_page_range__locked(pages, p, start, start + len,
160
- retaddr);
161
- }
162
+ tb_invalidate_phys_page_range__locked(pages, p, start, start + len,
163
+ retaddr);
164
}
165
#else
166
/* Called with mmap_lock held. If pc is not 0 then it indicates the
167
--
141
--
168
2.34.1
142
2.34.1
169
143
170
144
1
From: Alex Bennée <alex.bennee@linaro.org>
1
The old sparc64 implementation may replace two insns, which leaves
2
a race condition in which a thread could be stopped at a PC in the
3
middle of the sequence, and when restarted does not see the complete
4
address computation and branches to nowhere.
2
5
3
Before: 35.912 s ± 0.168 s
6
The new implementation replaces only one insn, swapping between a
4
After: 35.565 s ± 0.087 s
7
direct branch and a direct call. The TCG_REG_TB register is loaded
8
from tb->jmp_target_addr[] in the delay slot.
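A hedged sketch of the insn selection this implies, reusing check_fit_ptr()
from the hunk below (encodings elided; the displacement is counted in
32-bit words):

    /* Sketch: choose the branch form by how far the target is. */
    static int goto_tb_branch_width(uintptr_t addr, uintptr_t jmp_rx)
    {
        intptr_t disp = (intptr_t)(addr - jmp_rx) >> 2;

        if (check_fit_ptr(disp, 19)) {
            return 19;          /* ba,pt %icc, <addr>  (BPcc form) */
        } else if (check_fit_ptr(disp, 22)) {
            return 22;          /* ba <addr>           (Bicc form) */
        }
        return 30;              /* call <addr>; code_gen_buffer < 2GiB */
    }
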
5
9
6
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
10
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
7
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
8
Message-Id: <20220811151413.3350684-5-alex.bennee@linaro.org>
9
Signed-off-by: Cédric Le Goater <clg@kaod.org>
10
Message-Id: <20220923084803.498337-5-clg@kaod.org>
11
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
11
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
12
---
12
---
13
accel/tcg/cputlb.c | 15 ++++++---------
13
tcg/sparc64/tcg-target.c.inc | 87 +++++++++++++++---------------------
14
1 file changed, 6 insertions(+), 9 deletions(-)
14
1 file changed, 37 insertions(+), 50 deletions(-)
15
15
16
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
16
diff --git a/tcg/sparc64/tcg-target.c.inc b/tcg/sparc64/tcg-target.c.inc
17
index XXXXXXX..XXXXXXX 100644
17
index XXXXXXX..XXXXXXX 100644
18
--- a/accel/tcg/cputlb.c
18
--- a/tcg/sparc64/tcg-target.c.inc
19
+++ b/accel/tcg/cputlb.c
19
+++ b/tcg/sparc64/tcg-target.c.inc
20
@@ -XXX,XX +XXX,XX @@ void tlb_set_page(CPUState *cpu, target_ulong vaddr,
20
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
21
static void tlb_fill(CPUState *cpu, target_ulong addr, int size,
21
22
MMUAccessType access_type, int mmu_idx, uintptr_t retaddr)
22
static void tcg_out_goto_tb(TCGContext *s, int which)
23
{
23
{
24
- CPUClass *cc = CPU_GET_CLASS(cpu);
24
- int c;
25
bool ok;
25
+ ptrdiff_t off = tcg_tbrel_diff(s, (void *)get_jmp_target_addr(s, which));
26
27
- /* Direct jump. */
28
- /* make sure the patch is 8-byte aligned. */
29
- if ((intptr_t)s->code_ptr & 4) {
30
- tcg_out_nop(s);
31
- }
32
+ /* Direct branch will be patched by tb_target_set_jmp_target. */
33
set_jmp_insn_offset(s, which);
34
- tcg_out_sethi(s, TCG_REG_T1, 0);
35
- tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, 0, ARITH_OR);
36
- tcg_out_arith(s, TCG_REG_G0, TCG_REG_TB, TCG_REG_T1, JMPL);
37
- tcg_out_arith(s, TCG_REG_TB, TCG_REG_TB, TCG_REG_T1, ARITH_ADD);
38
+ tcg_out32(s, CALL);
39
+ /* delay slot */
40
+ tcg_debug_assert(check_fit_ptr(off, 13));
41
+ tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TB, TCG_REG_TB, off);
42
set_jmp_reset_offset(s, which);
26
43
27
/*
44
/*
28
* This is not a probe, so only valid return is success; failure
45
* For the unlinked path of goto_tb, we need to reset TCG_REG_TB
29
* should result in exception + longjmp to the cpu loop.
46
* to the beginning of this TB.
30
*/
47
*/
31
- ok = cc->tcg_ops->tlb_fill(cpu, addr, size,
48
- c = -tcg_current_code_size(s);
32
- access_type, mmu_idx, false, retaddr);
49
- if (check_fit_i32(c, 13)) {
33
+ ok = cpu->cc->tcg_ops->tlb_fill(cpu, addr, size,
50
- tcg_out_arithi(s, TCG_REG_TB, TCG_REG_TB, c, ARITH_ADD);
34
+ access_type, mmu_idx, false, retaddr);
51
+ off = -tcg_current_code_size(s);
35
assert(ok);
52
+ if (check_fit_i32(off, 13)) {
53
+ tcg_out_arithi(s, TCG_REG_TB, TCG_REG_TB, off, ARITH_ADD);
54
} else {
55
- tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_T1, c);
56
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_T1, off);
57
tcg_out_arith(s, TCG_REG_TB, TCG_REG_TB, TCG_REG_T1, ARITH_ADD);
58
}
36
}
59
}
37
60
38
@@ -XXX,XX +XXX,XX @@ static inline void cpu_unaligned_access(CPUState *cpu, vaddr addr,
61
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
39
MMUAccessType access_type,
62
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
40
int mmu_idx, uintptr_t retaddr)
63
+{
64
+ uintptr_t addr = tb->jmp_target_addr[n];
65
+ intptr_t br_disp = (intptr_t)(addr - jmp_rx) >> 2;
66
+ tcg_insn_unit insn;
67
+
68
+ br_disp >>= 2;
69
+ if (check_fit_ptr(br_disp, 19)) {
70
+ /* ba,pt %icc, addr */
71
+ insn = deposit32(INSN_OP(0) | INSN_OP2(1) | INSN_COND(COND_A)
72
+ | BPCC_ICC | BPCC_PT, 0, 19, br_disp);
73
+ } else if (check_fit_ptr(br_disp, 22)) {
74
+ /* ba addr */
75
+ insn = deposit32(INSN_OP(0) | INSN_OP2(2) | INSN_COND(COND_A),
76
+ 0, 22, br_disp);
77
+ } else {
78
+ /* The code_gen_buffer can't be larger than 2GB. */
79
+ tcg_debug_assert(check_fit_ptr(br_disp, 30));
80
+ /* call addr */
81
+ insn = deposit32(CALL, 0, 30, br_disp);
82
+ }
83
+
84
+ qatomic_set((uint32_t *)jmp_rw, insn);
85
+ flush_idcache_range(jmp_rx, jmp_rw, 4);
86
+}
87
+
88
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
89
const TCGArg args[TCG_MAX_OP_ARGS],
90
const int const_args[TCG_MAX_OP_ARGS])
91
@@ -XXX,XX +XXX,XX @@ void tcg_register_jit(const void *buf, size_t buf_size)
41
{
92
{
42
- CPUClass *cc = CPU_GET_CLASS(cpu);
93
tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
94
}
43
-
95
-
44
- cc->tcg_ops->do_unaligned_access(cpu, addr, access_type, mmu_idx, retaddr);
96
-void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
45
+ cpu->cc->tcg_ops->do_unaligned_access(cpu, addr, access_type,
97
- uintptr_t jmp_rx, uintptr_t jmp_rw)
46
+ mmu_idx, retaddr);
98
-{
47
}
99
- uintptr_t addr = tb->jmp_target_addr[n];
48
100
- intptr_t tb_disp = addr - (uintptr_t)tb->tc.ptr;
49
static inline void cpu_transaction_failed(CPUState *cpu, hwaddr physaddr,
101
- intptr_t br_disp = addr - jmp_rx;
50
@@ -XXX,XX +XXX,XX @@ static int probe_access_internal(CPUArchState *env, target_ulong addr,
102
- tcg_insn_unit i1, i2;
51
if (!tlb_hit_page(tlb_addr, page_addr)) {
103
-
52
if (!victim_tlb_hit(env, mmu_idx, index, elt_ofs, page_addr)) {
104
- /* We can reach the entire address space for ILP32.
53
CPUState *cs = env_cpu(env);
105
- For LP64, the code_gen_buffer can't be larger than 2GB. */
54
- CPUClass *cc = CPU_GET_CLASS(cs);
106
- tcg_debug_assert(tb_disp == (int32_t)tb_disp);
55
107
- tcg_debug_assert(br_disp == (int32_t)br_disp);
56
- if (!cc->tcg_ops->tlb_fill(cs, addr, fault_size, access_type,
108
-
57
- mmu_idx, nonfault, retaddr)) {
109
- /* This does not exercise the range of the branch, but we do
58
+ if (!cs->cc->tcg_ops->tlb_fill(cs, addr, fault_size, access_type,
110
- still need to be able to load the new value of TCG_REG_TB.
59
+ mmu_idx, nonfault, retaddr)) {
111
- But this does still happen quite often. */
60
/* Non-faulting page table read failed. */
112
- if (check_fit_ptr(tb_disp, 13)) {
61
*phost = NULL;
113
- /* ba,pt %icc, addr */
62
return TLB_INVALID_MASK;
114
- i1 = (INSN_OP(0) | INSN_OP2(1) | INSN_COND(COND_A)
115
- | BPCC_ICC | BPCC_PT | INSN_OFF19(br_disp));
116
- i2 = (ARITH_ADD | INSN_RD(TCG_REG_TB) | INSN_RS1(TCG_REG_TB)
117
- | INSN_IMM13(tb_disp));
118
- } else if (tb_disp >= 0) {
119
- i1 = SETHI | INSN_RD(TCG_REG_T1) | ((tb_disp & 0xfffffc00) >> 10);
120
- i2 = (ARITH_OR | INSN_RD(TCG_REG_T1) | INSN_RS1(TCG_REG_T1)
121
- | INSN_IMM13(tb_disp & 0x3ff));
122
- } else {
123
- i1 = SETHI | INSN_RD(TCG_REG_T1) | ((~tb_disp & 0xfffffc00) >> 10);
124
- i2 = (ARITH_XOR | INSN_RD(TCG_REG_T1) | INSN_RS1(TCG_REG_T1)
125
- | INSN_IMM13((tb_disp & 0x3ff) | -0x400));
126
- }
127
-
128
- qatomic_set((uint64_t *)jmp_rw, deposit64(i2, 32, 32, i1));
129
- flush_idcache_range(jmp_rx, jmp_rw, 8);
130
-}
63
--
131
--
64
2.34.1
132
2.34.1
65
133
66
134
1
The value previously chosen overlaps GUSA_MASK.
1
Now that tcg can handle direct and indirect goto_tb
2
simultaneously, we can optimistically leave space for
3
a direct branch and fall back to loading the pointer
4
from the TB for an indirect branch.
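Concretely, the translated code always ends with the indirect load of pc,
and tb_target_set_jmp_target() only decides whether the slot in front of it
holds a direct B or a NOP that falls through. A hedged sketch of that
choice, using the helpers visible in the hunk below (A32 branch offsets are
relative to the insn address plus 8):

    /* Sketch: direct branch when in range, else NOP and take the
       indirect path emitted at translation time. */
    ptrdiff_t offset = addr - (jmp_rx + 8);        /* A32 PC bias */
    uint32_t insn;

    if (offset == sextract64(offset, 0, 26)) {     /* 24-bit field << 2 */
        insn = deposit32((COND_AL << 28) | INSN_B, 0, 24, offset >> 2);
    } else {
        insn = INSN_NOP;
    }
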
2
5
3
Rename all DELAY_SLOT_* and GUSA_* defines to emphasize
6
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
4
that they are included in TB_FLAGS. Add aliases for the
5
FPSCR and SR bits that are included in TB_FLAGS, so that
6
we don't accidentally reassign those bits.
7
8
Fixes: 4da06fb3062 ("target/sh4: Implement prctl_unalign_sigbus")
9
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/856
10
Reviewed-by: Yoshinori Sato <ysato@users.sourceforge.jp>
11
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
12
---
8
---
13
target/sh4/cpu.h | 56 +++++++++++++------------
9
tcg/arm/tcg-target.c.inc | 52 ++++++++++++++++++++++++++++------------
14
linux-user/sh4/signal.c | 6 +--
10
1 file changed, 37 insertions(+), 15 deletions(-)
15
target/sh4/cpu.c | 6 +--
16
target/sh4/helper.c | 6 +--
17
target/sh4/translate.c | 90 ++++++++++++++++++++++-------------------
18
5 files changed, 88 insertions(+), 76 deletions(-)
19
11
20
diff --git a/target/sh4/cpu.h b/target/sh4/cpu.h
12
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
21
index XXXXXXX..XXXXXXX 100644
13
index XXXXXXX..XXXXXXX 100644
22
--- a/target/sh4/cpu.h
14
--- a/tcg/arm/tcg-target.c.inc
23
+++ b/target/sh4/cpu.h
15
+++ b/tcg/arm/tcg-target.c.inc
24
@@ -XXX,XX +XXX,XX @@
16
@@ -XXX,XX +XXX,XX @@ typedef enum {
25
#define FPSCR_RM_NEAREST (0 << 0)
17
ARITH_BIC = 0xe << 21,
26
#define FPSCR_RM_ZERO (1 << 0)
18
ARITH_MVN = 0xf << 21,
27
19
28
-#define DELAY_SLOT_MASK 0x7
20
+ INSN_B = 0x0a000000,
29
-#define DELAY_SLOT (1 << 0)
21
+
30
-#define DELAY_SLOT_CONDITIONAL (1 << 1)
22
INSN_CLZ = 0x016f0f10,
31
-#define DELAY_SLOT_RTE (1 << 2)
23
INSN_RBIT = 0x06ff0f30,
32
+#define TB_FLAG_DELAY_SLOT (1 << 0)
24
33
+#define TB_FLAG_DELAY_SLOT_COND (1 << 1)
25
@@ -XXX,XX +XXX,XX @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
34
+#define TB_FLAG_DELAY_SLOT_RTE (1 << 2)
26
35
+#define TB_FLAG_PENDING_MOVCA (1 << 3)
27
static void tcg_out_b_imm(TCGContext *s, ARMCond cond, int32_t offset)
36
+#define TB_FLAG_GUSA_SHIFT 4 /* [11:4] */
37
+#define TB_FLAG_GUSA_EXCLUSIVE (1 << 12)
38
+#define TB_FLAG_UNALIGN (1 << 13)
39
+#define TB_FLAG_SR_FD (1 << SR_FD) /* 15 */
40
+#define TB_FLAG_FPSCR_PR FPSCR_PR /* 19 */
41
+#define TB_FLAG_FPSCR_SZ FPSCR_SZ /* 20 */
42
+#define TB_FLAG_FPSCR_FR FPSCR_FR /* 21 */
43
+#define TB_FLAG_SR_RB (1 << SR_RB) /* 29 */
44
+#define TB_FLAG_SR_MD (1 << SR_MD) /* 30 */
45
46
-#define TB_FLAG_PENDING_MOVCA (1 << 3)
47
-#define TB_FLAG_UNALIGN (1 << 4)
48
-
49
-#define GUSA_SHIFT 4
50
-#ifdef CONFIG_USER_ONLY
51
-#define GUSA_EXCLUSIVE (1 << 12)
52
-#define GUSA_MASK ((0xff << GUSA_SHIFT) | GUSA_EXCLUSIVE)
53
-#else
54
-/* Provide dummy versions of the above to allow tests against tbflags
55
- to be elided while avoiding ifdefs. */
56
-#define GUSA_EXCLUSIVE 0
57
-#define GUSA_MASK 0
58
-#endif
59
-
60
-#define TB_FLAG_ENVFLAGS_MASK (DELAY_SLOT_MASK | GUSA_MASK)
61
+#define TB_FLAG_DELAY_SLOT_MASK (TB_FLAG_DELAY_SLOT | \
62
+ TB_FLAG_DELAY_SLOT_COND | \
63
+ TB_FLAG_DELAY_SLOT_RTE)
64
+#define TB_FLAG_GUSA_MASK ((0xff << TB_FLAG_GUSA_SHIFT) | \
65
+ TB_FLAG_GUSA_EXCLUSIVE)
66
+#define TB_FLAG_FPSCR_MASK (TB_FLAG_FPSCR_PR | \
67
+ TB_FLAG_FPSCR_SZ | \
68
+ TB_FLAG_FPSCR_FR)
69
+#define TB_FLAG_SR_MASK (TB_FLAG_SR_FD | \
70
+ TB_FLAG_SR_RB | \
71
+ TB_FLAG_SR_MD)
72
+#define TB_FLAG_ENVFLAGS_MASK (TB_FLAG_DELAY_SLOT_MASK | \
73
+ TB_FLAG_GUSA_MASK)
74
75
typedef struct tlb_t {
76
uint32_t vpn;        /* virtual page number */
77
@@ -XXX,XX +XXX,XX @@ static inline int cpu_mmu_index (CPUSH4State *env, bool ifetch)
78
{
28
{
79
/* The instruction in a RTE delay slot is fetched in privileged
29
- tcg_out32(s, (cond << 28) | 0x0a000000 |
80
mode, but executed in user mode. */
30
+ tcg_out32(s, (cond << 28) | INSN_B |
81
- if (ifetch && (env->flags & DELAY_SLOT_RTE)) {
31
(((offset - 8) >> 2) & 0x00ffffff));
82
+ if (ifetch && (env->flags & TB_FLAG_DELAY_SLOT_RTE)) {
32
}
83
return 0;
33
84
} else {
34
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg)
85
return (env->sr & (1u << SR_MD)) == 0 ? 1 : 0;
35
86
@@ -XXX,XX +XXX,XX @@ static inline void cpu_get_tb_cpu_state(CPUSH4State *env, target_ulong *pc,
36
static void tcg_out_goto_tb(TCGContext *s, int which)
87
{
37
{
88
*pc = env->pc;
38
- /* Indirect jump method */
89
/* For a gUSA region, notice the end of the region. */
39
- intptr_t ptr, dif, dil;
90
- *cs_base = env->flags & GUSA_MASK ? env->gregs[0] : 0;
40
- TCGReg base = TCG_REG_PC;
91
- *flags = env->flags /* TB_FLAG_ENVFLAGS_MASK: bits 0-2, 4-12 */
41
+ uintptr_t i_addr;
92
- | (env->fpscr & (FPSCR_FR | FPSCR_SZ | FPSCR_PR)) /* Bits 19-21 */
42
+ intptr_t i_disp;
93
- | (env->sr & ((1u << SR_MD) | (1u << SR_RB))) /* Bits 29-30 */
43
94
- | (env->sr & (1u << SR_FD)) /* Bit 15 */
44
- ptr = get_jmp_target_addr(s, which);
95
+ *cs_base = env->flags & TB_FLAG_GUSA_MASK ? env->gregs[0] : 0;
45
- dif = tcg_pcrel_diff(s, (void *)ptr) - 8;
96
+ *flags = env->flags
46
- dil = sextract32(dif, 0, 12);
97
+ | (env->fpscr & TB_FLAG_FPSCR_MASK)
47
- if (dif != dil) {
98
+ | (env->sr & TB_FLAG_SR_MASK)
48
+ /* Direct branch will be patched by tb_target_set_jmp_target. */
99
| (env->movcal_backup ? TB_FLAG_PENDING_MOVCA : 0); /* Bit 3 */
49
+ set_jmp_insn_offset(s, which);
100
#ifdef CONFIG_USER_ONLY
50
+ tcg_out32(s, INSN_NOP);
101
*flags |= TB_FLAG_UNALIGN * !env_cpu(env)->prctl_unalign_sigbus;
51
+
102
diff --git a/linux-user/sh4/signal.c b/linux-user/sh4/signal.c
52
+ /* When branch is out of range, fall through to indirect. */
103
index XXXXXXX..XXXXXXX 100644
53
+ i_addr = get_jmp_target_addr(s, which);
104
--- a/linux-user/sh4/signal.c
54
+ i_disp = tcg_pcrel_diff(s, (void *)i_addr) - 8;
105
+++ b/linux-user/sh4/signal.c
55
+ tcg_debug_assert(i_disp < 0);
106
@@ -XXX,XX +XXX,XX @@ static void restore_sigcontext(CPUSH4State *regs, struct target_sigcontext *sc)
56
+ if (i_disp >= -0xfff) {
107
__get_user(regs->fpul, &sc->sc_fpul);
57
+ tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_PC, i_disp);
108
58
+ } else {
109
regs->tra = -1; /* disable syscall checks */
59
/*
110
- regs->flags &= ~(DELAY_SLOT_MASK | GUSA_MASK);
60
* The TB is close, but outside the 12 bits addressable by
111
+ regs->flags = 0;
61
* the load. We can extend this to 20 bits with a sub of a
62
- * shifted immediate from pc. In the vastly unlikely event
63
- * the code requires more than 1MB, we'll use 2 insns and
64
- * be no worse off.
65
+ * shifted immediate from pc.
66
*/
67
- base = TCG_REG_R0;
68
- tcg_out_movi32(s, COND_AL, base, ptr - dil);
69
+ int h = -i_disp;
70
+ int l = h & 0xfff;
71
+
72
+ h = encode_imm_nofail(h - l);
73
+ tcg_out_dat_imm(s, COND_AL, ARITH_SUB, TCG_REG_R0, TCG_REG_PC, h);
74
+ tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_R0, l);
75
}
76
- tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, base, dil);
77
set_jmp_reset_offset(s, which);
112
}
78
}
113
79
114
void setup_frame(int sig, struct target_sigaction *ka,
80
void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
115
@@ -XXX,XX +XXX,XX @@ void setup_frame(int sig, struct target_sigaction *ka,
81
uintptr_t jmp_rx, uintptr_t jmp_rw)
116
regs->gregs[5] = 0;
82
{
117
regs->gregs[6] = frame_addr += offsetof(typeof(*frame), sc);
83
- /* Always indirect, nothing to do */
118
regs->pc = (unsigned long) ka->_sa_handler;
84
+ uintptr_t addr = tb->jmp_target_addr[n];
119
- regs->flags &= ~(DELAY_SLOT_MASK | GUSA_MASK);
85
+ ptrdiff_t offset = addr - (jmp_rx + 8);
120
+ regs->flags &= ~(TB_FLAG_DELAY_SLOT_MASK | TB_FLAG_GUSA_MASK);
86
+ tcg_insn_unit insn;
121
87
+
122
unlock_user_struct(frame, frame_addr, 1);
88
+ /* Either directly branch, or fall through to indirect branch. */
123
return;
89
+ if (offset == sextract64(offset, 0, 26)) {
124
@@ -XXX,XX +XXX,XX @@ void setup_rt_frame(int sig, struct target_sigaction *ka,
90
+ /* B <addr> */
125
regs->gregs[5] = frame_addr + offsetof(typeof(*frame), info);
91
+ insn = deposit32((COND_AL << 28) | INSN_B, 0, 24, offset >> 2);
126
regs->gregs[6] = frame_addr + offsetof(typeof(*frame), uc);
92
+ } else {
127
regs->pc = (unsigned long) ka->_sa_handler;
93
+ insn = INSN_NOP;
128
- regs->flags &= ~(DELAY_SLOT_MASK | GUSA_MASK);
94
+ }
129
+ regs->flags &= ~(TB_FLAG_DELAY_SLOT_MASK | TB_FLAG_GUSA_MASK);
95
+
130
96
+ qatomic_set((uint32_t *)jmp_rw, insn);
131
unlock_user_struct(frame, frame_addr, 1);
97
+ flush_idcache_range(jmp_rx, jmp_rw, 4);
132
return;
133
diff --git a/target/sh4/cpu.c b/target/sh4/cpu.c
134
index XXXXXXX..XXXXXXX 100644
135
--- a/target/sh4/cpu.c
136
+++ b/target/sh4/cpu.c
137
@@ -XXX,XX +XXX,XX @@ static void superh_cpu_synchronize_from_tb(CPUState *cs,
138
SuperHCPU *cpu = SUPERH_CPU(cs);
139
140
cpu->env.pc = tb_pc(tb);
141
- cpu->env.flags = tb->flags & TB_FLAG_ENVFLAGS_MASK;
142
+ cpu->env.flags = tb->flags;
143
}
98
}
144
99
145
#ifndef CONFIG_USER_ONLY
100
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
146
@@ -XXX,XX +XXX,XX @@ static bool superh_io_recompile_replay_branch(CPUState *cs,
147
SuperHCPU *cpu = SUPERH_CPU(cs);
148
CPUSH4State *env = &cpu->env;
149
150
- if ((env->flags & ((DELAY_SLOT | DELAY_SLOT_CONDITIONAL))) != 0
151
+ if ((env->flags & (TB_FLAG_DELAY_SLOT | TB_FLAG_DELAY_SLOT_COND))
152
&& env->pc != tb_pc(tb)) {
153
env->pc -= 2;
154
- env->flags &= ~(DELAY_SLOT | DELAY_SLOT_CONDITIONAL);
155
+ env->flags &= ~(TB_FLAG_DELAY_SLOT | TB_FLAG_DELAY_SLOT_COND);
156
return true;
157
}
158
return false;
159
diff --git a/target/sh4/helper.c b/target/sh4/helper.c
160
index XXXXXXX..XXXXXXX 100644
161
--- a/target/sh4/helper.c
162
+++ b/target/sh4/helper.c
163
@@ -XXX,XX +XXX,XX @@ void superh_cpu_do_interrupt(CPUState *cs)
164
env->sr |= (1u << SR_BL) | (1u << SR_MD) | (1u << SR_RB);
165
env->lock_addr = -1;
166
167
- if (env->flags & DELAY_SLOT_MASK) {
168
+ if (env->flags & TB_FLAG_DELAY_SLOT_MASK) {
169
/* Branch instruction should be executed again before delay slot. */
170
    env->spc -= 2;
171
    /* Clear flags for exception/interrupt routine. */
172
- env->flags &= ~DELAY_SLOT_MASK;
173
+ env->flags &= ~TB_FLAG_DELAY_SLOT_MASK;
174
}
175
176
if (do_exp) {
177
@@ -XXX,XX +XXX,XX @@ bool superh_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
178
CPUSH4State *env = &cpu->env;
179
180
/* Delay slots are indivisible, ignore interrupts */
181
- if (env->flags & DELAY_SLOT_MASK) {
182
+ if (env->flags & TB_FLAG_DELAY_SLOT_MASK) {
183
return false;
184
} else {
185
superh_cpu_do_interrupt(cs);
186
diff --git a/target/sh4/translate.c b/target/sh4/translate.c
187
index XXXXXXX..XXXXXXX 100644
188
--- a/target/sh4/translate.c
189
+++ b/target/sh4/translate.c
190
@@ -XXX,XX +XXX,XX @@ void superh_cpu_dump_state(CPUState *cs, FILE *f, int flags)
191
         i, env->gregs[i], i + 1, env->gregs[i + 1],
192
         i + 2, env->gregs[i + 2], i + 3, env->gregs[i + 3]);
193
}
194
- if (env->flags & DELAY_SLOT) {
195
+ if (env->flags & TB_FLAG_DELAY_SLOT) {
196
qemu_printf("in delay slot (delayed_pc=0x%08x)\n",
197
         env->delayed_pc);
198
- } else if (env->flags & DELAY_SLOT_CONDITIONAL) {
199
+ } else if (env->flags & TB_FLAG_DELAY_SLOT_COND) {
200
qemu_printf("in conditional delay slot (delayed_pc=0x%08x)\n",
201
         env->delayed_pc);
202
- } else if (env->flags & DELAY_SLOT_RTE) {
203
+ } else if (env->flags & TB_FLAG_DELAY_SLOT_RTE) {
204
qemu_fprintf(f, "in rte delay slot (delayed_pc=0x%08x)\n",
205
env->delayed_pc);
206
}
207
@@ -XXX,XX +XXX,XX @@ static inline void gen_save_cpu_state(DisasContext *ctx, bool save_pc)
208
209
static inline bool use_exit_tb(DisasContext *ctx)
210
{
211
- return (ctx->tbflags & GUSA_EXCLUSIVE) != 0;
212
+ return (ctx->tbflags & TB_FLAG_GUSA_EXCLUSIVE) != 0;
213
}
214
215
static bool use_goto_tb(DisasContext *ctx, target_ulong dest)
216
@@ -XXX,XX +XXX,XX @@ static void gen_conditional_jump(DisasContext *ctx, target_ulong dest,
217
TCGLabel *l1 = gen_new_label();
218
TCGCond cond_not_taken = jump_if_true ? TCG_COND_EQ : TCG_COND_NE;
219
220
- if (ctx->tbflags & GUSA_EXCLUSIVE) {
221
+ if (ctx->tbflags & TB_FLAG_GUSA_EXCLUSIVE) {
222
/* When in an exclusive region, we must continue to the end.
223
Therefore, exit the region on a taken branch, but otherwise
224
fall through to the next instruction. */
225
tcg_gen_brcondi_i32(cond_not_taken, cpu_sr_t, 0, l1);
226
- tcg_gen_movi_i32(cpu_flags, ctx->envflags & ~GUSA_MASK);
227
+ tcg_gen_movi_i32(cpu_flags, ctx->envflags & ~TB_FLAG_GUSA_MASK);
228
/* Note that this won't actually use a goto_tb opcode because we
229
disallow it in use_goto_tb, but it handles exit + singlestep. */
230
gen_goto_tb(ctx, 0, dest);
231
@@ -XXX,XX +XXX,XX @@ static void gen_delayed_conditional_jump(DisasContext * ctx)
232
tcg_gen_mov_i32(ds, cpu_delayed_cond);
233
tcg_gen_discard_i32(cpu_delayed_cond);
234
235
- if (ctx->tbflags & GUSA_EXCLUSIVE) {
236
+ if (ctx->tbflags & TB_FLAG_GUSA_EXCLUSIVE) {
237
/* When in an exclusive region, we must continue to the end.
238
Therefore, exit the region on a taken branch, but otherwise
239
fall through to the next instruction. */
240
tcg_gen_brcondi_i32(TCG_COND_EQ, ds, 0, l1);
241
242
/* Leave the gUSA region. */
243
- tcg_gen_movi_i32(cpu_flags, ctx->envflags & ~GUSA_MASK);
244
+ tcg_gen_movi_i32(cpu_flags, ctx->envflags & ~TB_FLAG_GUSA_MASK);
245
gen_jump(ctx);
246
247
gen_set_label(l1);
248
@@ -XXX,XX +XXX,XX @@ static inline void gen_store_fpr64(DisasContext *ctx, TCGv_i64 t, int reg)
249
#define XHACK(x) ((((x) & 1 ) << 4) | ((x) & 0xe))
250
251
#define CHECK_NOT_DELAY_SLOT \
252
- if (ctx->envflags & DELAY_SLOT_MASK) { \
253
- goto do_illegal_slot; \
254
+ if (ctx->envflags & TB_FLAG_DELAY_SLOT_MASK) { \
255
+ goto do_illegal_slot; \
256
}
257
258
#define CHECK_PRIVILEGED \
259
@@ -XXX,XX +XXX,XX @@ static void _decode_opc(DisasContext * ctx)
260
case 0x000b:        /* rts */
261
    CHECK_NOT_DELAY_SLOT
262
    tcg_gen_mov_i32(cpu_delayed_pc, cpu_pr);
263
- ctx->envflags |= DELAY_SLOT;
264
+ ctx->envflags |= TB_FLAG_DELAY_SLOT;
265
    ctx->delayed_pc = (uint32_t) - 1;
266
    return;
267
case 0x0028:        /* clrmac */
268
@@ -XXX,XX +XXX,XX @@ static void _decode_opc(DisasContext * ctx)
269
    CHECK_NOT_DELAY_SLOT
270
gen_write_sr(cpu_ssr);
271
    tcg_gen_mov_i32(cpu_delayed_pc, cpu_spc);
272
- ctx->envflags |= DELAY_SLOT_RTE;
273
+ ctx->envflags |= TB_FLAG_DELAY_SLOT_RTE;
274
    ctx->delayed_pc = (uint32_t) - 1;
275
ctx->base.is_jmp = DISAS_STOP;
276
    return;
277
@@ -XXX,XX +XXX,XX @@ static void _decode_opc(DisasContext * ctx)
278
    return;
279
case 0xe000:        /* mov #imm,Rn */
280
#ifdef CONFIG_USER_ONLY
281
- /* Detect the start of a gUSA region. If so, update envflags
282
- and end the TB. This will allow us to see the end of the
283
- region (stored in R0) in the next TB. */
284
+ /*
285
+ * Detect the start of a gUSA region (mov #-n, r15).
286
+ * If so, update envflags and end the TB. This will allow us
+ * to see the end of the region (stored in R0) in the next TB.
+ */
if (B11_8 == 15 && B7_0s < 0 &&
(tb_cflags(ctx->base.tb) & CF_PARALLEL)) {
- ctx->envflags = deposit32(ctx->envflags, GUSA_SHIFT, 8, B7_0s);
+ ctx->envflags =
+ deposit32(ctx->envflags, TB_FLAG_GUSA_SHIFT, 8, B7_0s);
ctx->base.is_jmp = DISAS_STOP;
}
#endif
@@ -XXX,XX +XXX,XX @@ static void _decode_opc(DisasContext * ctx)
case 0xa000:        /* bra disp */
    CHECK_NOT_DELAY_SLOT
ctx->delayed_pc = ctx->base.pc_next + 4 + B11_0s * 2;
- ctx->envflags |= DELAY_SLOT;
+ ctx->envflags |= TB_FLAG_DELAY_SLOT;
    return;
case 0xb000:        /* bsr disp */
    CHECK_NOT_DELAY_SLOT
tcg_gen_movi_i32(cpu_pr, ctx->base.pc_next + 4);
ctx->delayed_pc = ctx->base.pc_next + 4 + B11_0s * 2;
- ctx->envflags |= DELAY_SLOT;
+ ctx->envflags |= TB_FLAG_DELAY_SLOT;
    return;
}

@@ -XXX,XX +XXX,XX @@ static void _decode_opc(DisasContext * ctx)
    CHECK_NOT_DELAY_SLOT
tcg_gen_xori_i32(cpu_delayed_cond, cpu_sr_t, 1);
ctx->delayed_pc = ctx->base.pc_next + 4 + B7_0s * 2;
- ctx->envflags |= DELAY_SLOT_CONDITIONAL;
+ ctx->envflags |= TB_FLAG_DELAY_SLOT_COND;
    return;
case 0x8900:        /* bt label */
    CHECK_NOT_DELAY_SLOT
@@ -XXX,XX +XXX,XX @@ static void _decode_opc(DisasContext * ctx)
    CHECK_NOT_DELAY_SLOT
tcg_gen_mov_i32(cpu_delayed_cond, cpu_sr_t);
ctx->delayed_pc = ctx->base.pc_next + 4 + B7_0s * 2;
- ctx->envflags |= DELAY_SLOT_CONDITIONAL;
+ ctx->envflags |= TB_FLAG_DELAY_SLOT_COND;
    return;
case 0x8800:        /* cmp/eq #imm,R0 */
tcg_gen_setcondi_i32(TCG_COND_EQ, cpu_sr_t, REG(0), B7_0s);
@@ -XXX,XX +XXX,XX @@ static void _decode_opc(DisasContext * ctx)
case 0x0023:        /* braf Rn */
    CHECK_NOT_DELAY_SLOT
tcg_gen_addi_i32(cpu_delayed_pc, REG(B11_8), ctx->base.pc_next + 4);
- ctx->envflags |= DELAY_SLOT;
+ ctx->envflags |= TB_FLAG_DELAY_SLOT;
    ctx->delayed_pc = (uint32_t) - 1;
    return;
case 0x0003:        /* bsrf Rn */
    CHECK_NOT_DELAY_SLOT
tcg_gen_movi_i32(cpu_pr, ctx->base.pc_next + 4);
    tcg_gen_add_i32(cpu_delayed_pc, REG(B11_8), cpu_pr);
- ctx->envflags |= DELAY_SLOT;
+ ctx->envflags |= TB_FLAG_DELAY_SLOT;
    ctx->delayed_pc = (uint32_t) - 1;
    return;
case 0x4015:        /* cmp/pl Rn */
@@ -XXX,XX +XXX,XX @@ static void _decode_opc(DisasContext * ctx)
case 0x402b:        /* jmp @Rn */
    CHECK_NOT_DELAY_SLOT
    tcg_gen_mov_i32(cpu_delayed_pc, REG(B11_8));
- ctx->envflags |= DELAY_SLOT;
+ ctx->envflags |= TB_FLAG_DELAY_SLOT;
    ctx->delayed_pc = (uint32_t) - 1;
    return;
case 0x400b:        /* jsr @Rn */
    CHECK_NOT_DELAY_SLOT
tcg_gen_movi_i32(cpu_pr, ctx->base.pc_next + 4);
    tcg_gen_mov_i32(cpu_delayed_pc, REG(B11_8));
- ctx->envflags |= DELAY_SLOT;
+ ctx->envflags |= TB_FLAG_DELAY_SLOT;
    ctx->delayed_pc = (uint32_t) - 1;
    return;
case 0x400e:        /* ldc Rm,SR */
@@ -XXX,XX +XXX,XX @@ static void _decode_opc(DisasContext * ctx)
fflush(stderr);
#endif
do_illegal:
- if (ctx->envflags & DELAY_SLOT_MASK) {
+ if (ctx->envflags & TB_FLAG_DELAY_SLOT_MASK) {
do_illegal_slot:
gen_save_cpu_state(ctx, true);
gen_helper_raise_slot_illegal_instruction(cpu_env);
@@ -XXX,XX +XXX,XX @@ static void _decode_opc(DisasContext * ctx)

do_fpu_disabled:
gen_save_cpu_state(ctx, true);
- if (ctx->envflags & DELAY_SLOT_MASK) {
+ if (ctx->envflags & TB_FLAG_DELAY_SLOT_MASK) {
gen_helper_raise_slot_fpu_disable(cpu_env);
} else {
gen_helper_raise_fpu_disable(cpu_env);
@@ -XXX,XX +XXX,XX @@ static void decode_opc(DisasContext * ctx)

_decode_opc(ctx);

- if (old_flags & DELAY_SLOT_MASK) {
+ if (old_flags & TB_FLAG_DELAY_SLOT_MASK) {
/* go out of the delay slot */
- ctx->envflags &= ~DELAY_SLOT_MASK;
+ ctx->envflags &= ~TB_FLAG_DELAY_SLOT_MASK;

/* When in an exclusive region, we must continue to the end
for conditional branches. */
- if (ctx->tbflags & GUSA_EXCLUSIVE
- && old_flags & DELAY_SLOT_CONDITIONAL) {
+ if (ctx->tbflags & TB_FLAG_GUSA_EXCLUSIVE
+ && old_flags & TB_FLAG_DELAY_SLOT_COND) {
gen_delayed_conditional_jump(ctx);
return;
}
/* Otherwise this is probably an invalid gUSA region.
Drop the GUSA bits so the next TB doesn't see them. */
- ctx->envflags &= ~GUSA_MASK;
+ ctx->envflags &= ~TB_FLAG_GUSA_MASK;

tcg_gen_movi_i32(cpu_flags, ctx->envflags);
- if (old_flags & DELAY_SLOT_CONDITIONAL) {
+ if (old_flags & TB_FLAG_DELAY_SLOT_COND) {
     gen_delayed_conditional_jump(ctx);
} else {
gen_jump(ctx);
@@ -XXX,XX +XXX,XX @@ static void decode_gusa(DisasContext *ctx, CPUSH4State *env)
}

/* The entire region has been translated. */
- ctx->envflags &= ~GUSA_MASK;
+ ctx->envflags &= ~TB_FLAG_GUSA_MASK;
ctx->base.pc_next = pc_end;
ctx->base.num_insns += max_insns - 1;
return;
@@ -XXX,XX +XXX,XX @@ static void decode_gusa(DisasContext *ctx, CPUSH4State *env)

/* Restart with the EXCLUSIVE bit set, within a TB run via
cpu_exec_step_atomic holding the exclusive lock. */
- ctx->envflags |= GUSA_EXCLUSIVE;
+ ctx->envflags |= TB_FLAG_GUSA_EXCLUSIVE;
gen_save_cpu_state(ctx, false);
gen_helper_exclusive(cpu_env);
ctx->base.is_jmp = DISAS_NORETURN;
@@ -XXX,XX +XXX,XX @@ static void sh4_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cs)
(tbflags & (1 << SR_RB))) * 0x10;
ctx->fbank = tbflags & FPSCR_FR ? 0x10 : 0;

- if (tbflags & GUSA_MASK) {
+#ifdef CONFIG_USER_ONLY
+ if (tbflags & TB_FLAG_GUSA_MASK) {
+ /* In gUSA exclusive region. */
uint32_t pc = ctx->base.pc_next;
uint32_t pc_end = ctx->base.tb->cs_base;
- int backup = sextract32(ctx->tbflags, GUSA_SHIFT, 8);
+ int backup = sextract32(ctx->tbflags, TB_FLAG_GUSA_SHIFT, 8);
int max_insns = (pc_end - pc) / 2;

if (pc != pc_end + backup || max_insns < 2) {
/* This is a malformed gUSA region. Don't do anything special,
since the interpreter is likely to get confused. */
- ctx->envflags &= ~GUSA_MASK;
- } else if (tbflags & GUSA_EXCLUSIVE) {
+ ctx->envflags &= ~TB_FLAG_GUSA_MASK;
+ } else if (tbflags & TB_FLAG_GUSA_EXCLUSIVE) {
/* Regardless of single-stepping or the end of the page,
we must complete execution of the gUSA region while
holding the exclusive lock. */
@@ -XXX,XX +XXX,XX @@ static void sh4_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cs)
return;
}
}
+#endif

/* Since the ISA is fixed-width, we can bound by the number
of instructions remaining on the page. */
@@ -XXX,XX +XXX,XX @@ static void sh4_tr_translate_insn(DisasContextBase *dcbase, CPUState *cs)
DisasContext *ctx = container_of(dcbase, DisasContext, base);

#ifdef CONFIG_USER_ONLY
- if (unlikely(ctx->envflags & GUSA_MASK)
- && !(ctx->envflags & GUSA_EXCLUSIVE)) {
+ if (unlikely(ctx->envflags & TB_FLAG_GUSA_MASK)
+ && !(ctx->envflags & TB_FLAG_GUSA_EXCLUSIVE)) {
/* We're in an gUSA region, and we have not already fallen
back on using an exclusive region. Attempt to parse the
region into a single supported atomic operation. Failure
@@ -XXX,XX +XXX,XX @@ static void sh4_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs)
{
DisasContext *ctx = container_of(dcbase, DisasContext, base);

- if (ctx->tbflags & GUSA_EXCLUSIVE) {
+ if (ctx->tbflags & TB_FLAG_GUSA_EXCLUSIVE) {
/* Ending the region of exclusivity. Clear the bits. */
- ctx->envflags &= ~GUSA_MASK;
+ ctx->envflags &= ~TB_FLAG_GUSA_MASK;
}

switch (ctx->base.is_jmp) {
--
2.34.1

diff view generated by jsdifflib
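As background for the gUSA handling in the sh4 patch above: the translator packs a signed 8-bit displacement into the per-TB flags word with deposit32() and recovers it later with sextract32(); the patch itself only renames the bit definitions to TB_FLAG_*. The self-contained C sketch below shows that pack/unpack round trip. The deposit32()/sextract32() bodies are simplified stand-ins for QEMU's bitops helpers, and the flag bit and shift values used here are invented for illustration, not the ones target/sh4 actually defines.

#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for QEMU's deposit32()/sextract32() helpers. */
static uint32_t deposit32(uint32_t value, int start, int length, uint32_t fieldval)
{
    uint32_t mask = (~0u >> (32 - length)) << start;
    return (value & ~mask) | ((fieldval << start) & mask);
}

static int32_t sextract32(uint32_t value, int start, int length)
{
    /* Shift the field up to the sign bit, then arithmetic-shift back down. */
    return (int32_t)(value << (32 - length - start)) >> (32 - length);
}

/* Made-up layout for illustration only: one delay-slot bit plus an
 * 8-bit signed gUSA displacement starting at bit 4. */
#define TB_FLAG_DELAY_SLOT  (1 << 0)
#define TB_FLAG_GUSA_SHIFT  4

int main(void)
{
    uint32_t envflags = TB_FLAG_DELAY_SLOT;
    int8_t disp = -24;      /* negative displacement back to the region start */

    envflags = deposit32(envflags, TB_FLAG_GUSA_SHIFT, 8, (uint8_t)disp);
    int backup = sextract32(envflags, TB_FLAG_GUSA_SHIFT, 8);

    printf("envflags=0x%08" PRIx32 " backup=%d\n", envflags, backup);
    assert(backup == disp); /* the sign survives the 8-bit field */
    return 0;
}

Built with any C99 compiler, it prints the packed flags word and recovers -24, which is the same trick the translator relies on when it stashes the gUSA backup displacement.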
Now that we have collected all of the page data into
CPUTLBEntryFull, provide an interface to record that
all in one go, instead of using 4 arguments. This interface
allows CPUTLBEntryFull to be extended without having to
change the number of arguments.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
include/exec/cpu-defs.h | 14 +++++++++++
include/exec/exec-all.h | 22 ++++++++++++++++++
accel/tcg/cputlb.c | 51 ++++++++++++++++++++++++++---------------
3 files changed, 69 insertions(+), 18 deletions(-)

diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/cpu-defs.h
+++ b/include/exec/cpu-defs.h
@@ -XXX,XX +XXX,XX @@ typedef struct CPUTLBEntryFull {
 * + the offset within the target MemoryRegion (otherwise)
 */
 hwaddr xlat_section;
+
+ /*
+ * @phys_addr contains the physical address in the address space
+ * given by cpu_asidx_from_attrs(cpu, @attrs).
+ */
+ hwaddr phys_addr;
+
+ /* @attrs contains the memory transaction attributes for the page. */
 MemTxAttrs attrs;
+
+ /* @prot contains the complete protections for the page. */
+ uint8_t prot;
+
+ /* @lg_page_size contains the log2 of the page size. */
+ uint8_t lg_page_size;
} CPUTLBEntryFull;

/*
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -XXX,XX +XXX,XX @@ void tlb_flush_range_by_mmuidx_all_cpus_synced(CPUState *cpu,
 uint16_t idxmap,
 unsigned bits);

+/**
+ * tlb_set_page_full:
+ * @cpu: CPU context
+ * @mmu_idx: mmu index of the tlb to modify
+ * @vaddr: virtual address of the entry to add
+ * @full: the details of the tlb entry
+ *
+ * Add an entry to @cpu tlb index @mmu_idx. All of the fields of
+ * @full must be filled, except for xlat_section, and constitute
+ * the complete description of the translated page.
+ *
+ * This is generally called by the target tlb_fill function after
+ * having performed a successful page table walk to find the physical
+ * address and attributes for the translation.
+ *
+ * At most one entry for a given virtual address is permitted. Only a
+ * single TARGET_PAGE_SIZE region is mapped; @full->lg_page_size is only
+ * used by tlb_flush_page.
+ */
+void tlb_set_page_full(CPUState *cpu, int mmu_idx, target_ulong vaddr,
+ CPUTLBEntryFull *full);
+
/**
 * tlb_set_page_with_attrs:
 * @cpu: CPU to add this TLB entry for
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@ static void tlb_add_large_page(CPUArchState *env, int mmu_idx,
 env_tlb(env)->d[mmu_idx].large_page_mask = lp_mask;
}

-/* Add a new TLB entry. At most one entry for a given virtual address
+/*
+ * Add a new TLB entry. At most one entry for a given virtual address
 * is permitted. Only a single TARGET_PAGE_SIZE region is mapped, the
 * supplied size is only used by tlb_flush_page.
 *
 * Called from TCG-generated code, which is under an RCU read-side
 * critical section.
 */
-void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
- hwaddr paddr, MemTxAttrs attrs, int prot,
- int mmu_idx, target_ulong size)
+void tlb_set_page_full(CPUState *cpu, int mmu_idx,
+ target_ulong vaddr, CPUTLBEntryFull *full)
{
 CPUArchState *env = cpu->env_ptr;
 CPUTLB *tlb = env_tlb(env);
@@ -XXX,XX +XXX,XX @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
 CPUTLBEntry *te, tn;
 hwaddr iotlb, xlat, sz, paddr_page;
 target_ulong vaddr_page;
- int asidx = cpu_asidx_from_attrs(cpu, attrs);
- int wp_flags;
+ int asidx, wp_flags, prot;
 bool is_ram, is_romd;

 assert_cpu_is_self(cpu);

- if (size <= TARGET_PAGE_SIZE) {
+ if (full->lg_page_size <= TARGET_PAGE_BITS) {
 sz = TARGET_PAGE_SIZE;
 } else {
- tlb_add_large_page(env, mmu_idx, vaddr, size);
- sz = size;
+ sz = (hwaddr)1 << full->lg_page_size;
+ tlb_add_large_page(env, mmu_idx, vaddr, sz);
 }
 vaddr_page = vaddr & TARGET_PAGE_MASK;
- paddr_page = paddr & TARGET_PAGE_MASK;
+ paddr_page = full->phys_addr & TARGET_PAGE_MASK;

+ prot = full->prot;
+ asidx = cpu_asidx_from_attrs(cpu, full->attrs);
 section = address_space_translate_for_iotlb(cpu, asidx, paddr_page,
- &xlat, &sz, attrs, &prot);
+ &xlat, &sz, full->attrs, &prot);
 assert(sz >= TARGET_PAGE_SIZE);

 tlb_debug("vaddr=" TARGET_FMT_lx " paddr=0x" TARGET_FMT_plx
 " prot=%x idx=%d\n",
- vaddr, paddr, prot, mmu_idx);
+ vaddr, full->phys_addr, prot, mmu_idx);

 address = vaddr_page;
- if (size < TARGET_PAGE_SIZE) {
+ if (full->lg_page_size < TARGET_PAGE_BITS) {
 /* Repeat the MMU check and TLB fill on every access. */
 address |= TLB_INVALID_MASK;
 }
- if (attrs.byte_swap) {
+ if (full->attrs.byte_swap) {
 address |= TLB_BSWAP;
 }

@@ -XXX,XX +XXX,XX @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
 * subtract here is that of the page base, and not the same as the
 * vaddr we add back in io_readx()/io_writex()/get_page_addr_code().
 */
+ desc->fulltlb[index] = *full;
 desc->fulltlb[index].xlat_section = iotlb - vaddr_page;
- desc->fulltlb[index].attrs = attrs;
+ desc->fulltlb[index].phys_addr = paddr_page;
+ desc->fulltlb[index].prot = prot;

 /* Now calculate the new entry */
 tn.addend = addend - vaddr_page;
@@ -XXX,XX +XXX,XX @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
 qemu_spin_unlock(&tlb->c.lock);
}

-/* Add a new TLB entry, but without specifying the memory
- * transaction attributes to be used.
- */
+void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
+ hwaddr paddr, MemTxAttrs attrs, int prot,
+ int mmu_idx, target_ulong size)
+{
+ CPUTLBEntryFull full = {
+ .phys_addr = paddr,
+ .attrs = attrs,
+ .prot = prot,
+ .lg_page_size = ctz64(size)
+ };
+
+ assert(is_power_of_2(size));
+ tlb_set_page_full(cpu, mmu_idx, vaddr, &full);
+}
+
void tlb_set_page(CPUState *cpu, target_ulong vaddr,
 hwaddr paddr, int prot,
 int mmu_idx, target_ulong size)
--
2.34.1

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
tcg/riscv/tcg-target.c.inc | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/riscv/tcg-target.c.inc
+++ b/tcg/riscv/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ typedef enum {
#endif

 OPC_FENCE = 0x0000000f,
+ OPC_NOP = OPC_ADDI, /* nop = addi r0,r0,0 */
} RISCVInsn;

/*
@@ -XXX,XX +XXX,XX @@ static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
{
 int i;
 for (i = 0; i < count; ++i) {
- p[i] = encode_i(OPC_ADDI, TCG_REG_ZERO, TCG_REG_ZERO, 0);
+ p[i] = OPC_NOP;
 }
}
--
2.34.1

diff view generated by jsdifflib
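A note on the tlb_set_page_full() conversion in the block above: the old byte-count size argument becomes the log2 field lg_page_size, which the compatibility wrapper derives with ctz64(size) (hence the is_power_of_2 assert) and which tlb_set_page_full() turns back into a byte size with (hwaddr)1 << lg_page_size. The self-contained sketch below shows that round trip; the ctz64() and is_power_of_2() bodies are local stand-ins rather than QEMU's versions.

#include <assert.h>
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Local stand-in for ctz64(): count trailing zeros == log2 for a power of two. */
static unsigned ctz64(uint64_t v)
{
    unsigned n = 0;
    assert(v != 0);
    while (!(v & 1)) {
        v >>= 1;
        n++;
    }
    return n;
}

static int is_power_of_2(uint64_t v)
{
    return v != 0 && (v & (v - 1)) == 0;
}

int main(void)
{
    /* 4KiB, 2MiB and 1GiB pages. */
    const uint64_t sizes[] = { UINT64_C(1) << 12, UINT64_C(1) << 21, UINT64_C(1) << 30 };

    for (size_t i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
        uint64_t size = sizes[i];

        assert(is_power_of_2(size));                    /* mirrors the wrapper's assert */
        uint8_t lg_page_size = (uint8_t)ctz64(size);    /* what the wrapper stores */
        uint64_t rebuilt = UINT64_C(1) << lg_page_size; /* what the core recomputes */

        printf("size=%" PRIu64 " lg_page_size=%u rebuilt=%" PRIu64 "\n",
               size, (unsigned)lg_page_size, rebuilt);
        assert(rebuilt == size);
    }
    return 0;
}

Storing the exponent in a uint8_t is what lets the page size live inside CPUTLBEntryFull without widening the structure, which is the point of the interface change described in the commit message.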
This field is only written, not read; remove it.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
include/hw/core/cpu.h | 1 -
accel/tcg/cputlb.c | 7 +++----
2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -XXX,XX +XXX,XX @@ struct CPUWatchpoint {
 * the memory regions get moved around by io_writex.
 */
typedef struct SavedIOTLB {
- hwaddr addr;
 MemoryRegionSection *section;
 hwaddr mr_offset;
} SavedIOTLB;
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@ static uint64_t io_readx(CPUArchState *env, CPUTLBEntryFull *full,
 * This is read by tlb_plugin_lookup if the fulltlb entry doesn't match
 * because of the side effect of io_writex changing memory layout.
 */
-static void save_iotlb_data(CPUState *cs, hwaddr addr,
- MemoryRegionSection *section, hwaddr mr_offset)
+static void save_iotlb_data(CPUState *cs, MemoryRegionSection *section,
+ hwaddr mr_offset)
{
#ifdef CONFIG_PLUGIN
 SavedIOTLB *saved = &cs->saved_iotlb;
- saved->addr = addr;
 saved->section = section;
 saved->mr_offset = mr_offset;
#endif
@@ -XXX,XX +XXX,XX @@ static void io_writex(CPUArchState *env, CPUTLBEntryFull *full,
 * The memory_region_dispatch may trigger a flush/resize
 * so for plugins we save the iotlb_data just in case.
 */
- save_iotlb_data(cpu, full->xlat_section, section, mr_offset);
+ save_iotlb_data(cpu, section, mr_offset);

 if (!qemu_mutex_iothread_locked()) {
 qemu_mutex_lock_iothread();
--
2.34.1

Now that tcg can handle direct and indirect goto_tb simultaneously,
we can optimistically leave space for a direct branch and fall back
to loading the pointer from the TB for an indirect branch.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
tcg/riscv/tcg-target.c.inc | 19 +++++++++++++++++--
1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/riscv/tcg-target.c.inc
+++ b/tcg/riscv/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)

static void tcg_out_goto_tb(TCGContext *s, int which)
{
- /* indirect jump method */
+ /* Direct branch will be patched by tb_target_set_jmp_target. */
+ set_jmp_insn_offset(s, which);
+ tcg_out32(s, OPC_JAL);
+
+ /* When branch is out of range, fall through to indirect. */
 tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, TCG_REG_ZERO,
 get_jmp_target_addr(s, which));
 tcg_out_opc_imm(s, OPC_JALR, TCG_REG_ZERO, TCG_REG_TMP0, 0);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_goto_tb(TCGContext *s, int which)
void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
 uintptr_t jmp_rx, uintptr_t jmp_rw)
{
- /* Always indirect, nothing to do */
+ uintptr_t addr = tb->jmp_target_addr[n];
+ ptrdiff_t offset = addr - jmp_rx;
+ tcg_insn_unit insn;
+
+ /* Either directly branch, or fall through to indirect branch. */
+ if (offset == sextreg(offset, 0, 20)) {
+ insn = encode_uj(OPC_JAL, TCG_REG_ZERO, offset);
+ } else {
+ insn = OPC_NOP;
+ }
+ qatomic_set((uint32_t *)jmp_rw, insn);
+ flush_idcache_range(jmp_rx, jmp_rw, 4);
}

static void tcg_out_op(TCGContext *s, TCGOpcode opc,
--
2.34.1

diff view generated by jsdifflib
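A note on the direct-branch patching shown in the last diff above: the backend reserves one 4-byte slot at translation time, and tb_target_set_jmp_target() later overwrites it with either a jal x0,offset (when the displacement passes the sextreg(offset, 0, 20) test) or the canonical RISC-V nop 0x00000013 (addi x0, x0, 0, which is what the series' OPC_NOP stands for), so execution falls through to the indirect JALR sequence. The sketch below reproduces that decision with hypothetical helpers (a sextreg stand-in, a hand-rolled JAL encoder, pick_goto_tb_insn); it deliberately omits the qatomic_set() store and flush_idcache_range() that the real backend performs when rewriting the slot.

#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define OPC_NOP UINT32_C(0x00000013)            /* addi x0, x0, 0 */

/* Local stand-in for the backend's sextreg(): sign-extend a len-bit field. */
static int64_t sextreg(int64_t value, int pos, int len)
{
    return (int64_t)((uint64_t)value << (64 - len - pos)) >> (64 - len);
}

/* Hand-rolled "jal x0, offset" encoder: imm[20|10:1|11|19:12], rd = x0, opcode 0x6f. */
static uint32_t encode_jal_x0(int32_t offset)
{
    uint32_t imm = (uint32_t)offset;

    return (((imm >> 20) & 1) << 31) | (((imm >> 1) & 0x3ff) << 21)
         | (((imm >> 11) & 1) << 20) | (((imm >> 12) & 0xff) << 12)
         | 0x6f;
}

/* Hypothetical helper: choose what to write into the reserved goto_tb slot. */
static uint32_t pick_goto_tb_insn(uint64_t target, uint64_t slot)
{
    int64_t offset = (int64_t)(target - slot);

    /* Same range test as the patch: the displacement must survive
     * sign-extension from 20 bits unchanged. */
    if (offset == sextreg(offset, 0, 20)) {
        return encode_jal_x0((int32_t)offset);  /* direct branch */
    }
    return OPC_NOP;                             /* fall through to indirect JALR */
}

int main(void)
{
    uint32_t near = pick_goto_tb_insn(0x10000, 0x10800);   /* 2048 bytes back */
    uint32_t far = pick_goto_tb_insn(0x40000000, 0x1000);  /* out of range */

    printf("near: 0x%08" PRIx32 "\nfar:  0x%08" PRIx32 "\n", near, far);
    assert(far == OPC_NOP);
    return 0;
}

Because either instruction is exactly one 32-bit word, the slot can be repatched in a single aligned store, which is what makes the optimistic direct-branch approach described in the commit message safe against concurrently executing TBs.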